diff --git a/.travis.yml b/.travis.yml index 6588db3e7..75179d4e5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,9 +18,11 @@ install: os: - linux + # command to run tests, e.g. python setup.py test script: coverage run --source=pythainlp setup.py test + after_success: coveralls diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 62ecbcbb3..a10d62615 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,15 +19,14 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod ## Code Guidelines -- Use [PEP8](http://www.python.org/dev/peps/pep-0008/); +- Follow [PEP8](http://www.python.org/dev/peps/pep-0008/) and use [black](https://github.com/ambv/black); - Write tests for your new features (please see "Tests" topic below); - Always remember that [commented code is dead code](http://www.codinghorror.com/blog/2008/07/coding-without-comments.html); - Name identifiers (variables, classes, functions, module names) with meaningful and pronounceable names (`x` is always wrong); -- When manipulating strings, use [Python's new-style - formatting](http://docs.python.org/library/string.html#format-string-syntax) - (`'{} = {}'.format(a, b)` instead of `'%s = %s' % (a, b)`); +- When manipulating strings, use [f-strings](https://www.python.org/dev/peps/pep-0498/) + (use `f"{a} = {b}"` instead of `"{} = {}".format(a, b)` or `"%s = %s" % (a, b)`); - All `#TODO` comments should be turned into issues (use our [GitHub issue system](https://github.com/PyThaiNLP/pythainlp/)); - Run all tests before pushing (just execute `tox`) so you will know if your diff --git a/Makefile b/Makefile index d5c977215..0f103632c 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,6 @@ help: clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts - clean-build: ## remove build artifacts rm -fr build/ rm -fr dist/ @@ -51,19 +50,16 @@ lint: ## check style with flake8 flake8 pythainlp tests test: ## run tests quickly with the default Python - - python setup.py test + python setup.py test test-all: ## run tests on every Python version with tox tox coverage: ## check code coverage quickly with the default Python - - coverage run --source pythainlp setup.py test - - coverage report -m - coverage html - $(BROWSER) htmlcov/index.html + coverage run --source pythainlp setup.py test + coverage report -m + coverage html + $(BROWSER) htmlcov/index.html release: clean ## package and upload a release python setup.py sdist upload diff --git a/README.md b/README.md index 880a579ca..fd2c26acb 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil **This is a document for development branch (post 1.7.x). Things will break. For a stable branch document, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** -📫 follow us on Facebook [Pythainlp](https://www.facebook.com/pythainlp/) +📫 follow us on Facebook [PyThaiNLP](https://www.facebook.com/pythainlp/) ## Capabilities @@ -34,7 +34,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil - Thai misspellings detection and spelling correction (```spell```) - Thai soundex (```lk82```, ```udom83```, ```metasound```) - Thai WordNet wrapper -- and much more - see [examples](https://github.com/PyThaiNLP/pythainlp/tree/dev/examples). +- and much more - see examples in [PyThaiNLP Get Started notebook](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb).
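# Illustrative sketch of the string-formatting guideline in the CONTRIBUTING.md hunk above;
# assumes Python 3.6+ (f-strings, PEP 498). The names a and b are only placeholders.
a, b = "tokens", 42
print(f"{a} = {b}")            # preferred: f-string
print("{} = {}".format(a, b))  # str.format(), discouraged by the guideline
print("%s = %s" % (a, b))      # %-formatting, discouraged by the guideline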
## Installation @@ -125,7 +125,7 @@ PyThaiNLP āđ€āļ›āđ‡āļ™āđ„āļĨāļšāļēāļĢāļĩāļ āļēāļĐāļēāđ„āļžāļ—āļ­āļ™āđ€āļžāļ·āđˆ - āļ•āļĢāļ§āļˆāļ„āļģāļŠāļ°āļāļ”āļœāļīāļ”āđƒāļ™āļ āļēāļĐāļēāđ„āļ—āļĒ (```spell```) - soundex āļ āļēāļĐāļēāđ„āļ—āļĒ (```lk82```, ```udom83```, ```metasound```) - Thai WordNet wrapper -- āđāļĨāļ°āļ­āļ·āđˆāļ™ āđ† [āļ”āļđāļ•āļąāļ§āļ­āļĒāđˆāļēāļ‡](https://github.com/PyThaiNLP/pythainlp/tree/dev/examples) +- āđāļĨāļ°āļ­āļ·āđˆāļ™ āđ† āļ”āļđāļ•āļąāļ§āļ­āļĒāđˆāļēāļ‡āđ„āļ”āđ‰āđƒāļ™ [PyThaiNLP Get Started notebook](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb) ## āļ•āļīāļ”āļ•āļąāđ‰āļ‡ diff --git a/conda.recipe/meta-old.yaml b/conda.recipe/meta-old.yaml deleted file mode 100644 index 632fb2109..000000000 --- a/conda.recipe/meta-old.yaml +++ /dev/null @@ -1,49 +0,0 @@ -{% set version = "1.7.2" %} - -package: - name: pythainlp - version: {{ version }} - -build: - noarch: python - number: 0 - script: python -m pip install --no-deps --ignore-installed . - -requirements: - host: - - pip - - python - - setuptools - - nltk - - future - - six - - marisa_trie - - dill - - pytz - - tinydb - - tqdm - - - run: - - python - - nltk - - future - - six - - marisa_trie - - dill - - pytz - - tinydb - - tqdm - -test: - imports: - - pvlib - -about: - home: https://github.com/PyThaiNLP/pythainlp - license: Apache License 2.0 - summary: 'Thai Natural Language Processing in Python.' - -extra: - recipe-maintainers: - - pythainlp diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 8e36acad6..ff5babfff 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "1.7.2" %} +{% set version = "2.0.1" %} package: name: pythainlp diff --git a/docs/api/spell.rst b/docs/api/spell.rst index 7544a58d5..b2c77736b 100644 --- a/docs/api/spell.rst +++ b/docs/api/spell.rst @@ -8,8 +8,4 @@ Modules ------- .. autofunction:: spell -.. autofunction:: pythainlp.spell.pn.spell -.. autofunction:: pythainlp.spell.pn.prob -.. autofunction:: pythainlp.spell.pn.correct -.. autofunction:: pythainlp.spell.pn.known -.. autofunction:: pythainlp.spell.pn.dictionary +.. autofunction:: correct diff --git a/docs/api/util.rst b/docs/api/util.rst index 1906fe48d..166f52375 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -14,9 +14,9 @@ Modules .. autofunction:: digit_to_text .. autofunction:: eng_to_thai .. autofunction:: find_keyword -.. autofunction:: is_thai -.. autofunction:: is_thaichar -.. autofunction:: is_thaiword +.. autofunction:: countthai +.. autofunction:: isthai +.. autofunction:: isthaichar .. autofunction:: normalize .. autofunction:: now_reign_year .. 
autofunction:: num_to_thaiword diff --git a/docs/pythainlp-1-3-thai.md b/docs/archive/pythainlp-1-3-thai.md similarity index 100% rename from docs/pythainlp-1-3-thai.md rename to docs/archive/pythainlp-1-3-thai.md diff --git a/docs/pythainlp-1-4-eng.md b/docs/archive/pythainlp-1-4-eng.md similarity index 100% rename from docs/pythainlp-1-4-eng.md rename to docs/archive/pythainlp-1-4-eng.md diff --git a/docs/pythainlp-1-4-eng.pdf b/docs/archive/pythainlp-1-4-eng.pdf similarity index 100% rename from docs/pythainlp-1-4-eng.pdf rename to docs/archive/pythainlp-1-4-eng.pdf diff --git a/docs/pythainlp-1-4-thai.md b/docs/archive/pythainlp-1-4-thai.md similarity index 100% rename from docs/pythainlp-1-4-thai.md rename to docs/archive/pythainlp-1-4-thai.md diff --git a/docs/pythainlp-1-4-thai.pdf b/docs/archive/pythainlp-1-4-thai.pdf similarity index 100% rename from docs/pythainlp-1-4-thai.pdf rename to docs/archive/pythainlp-1-4-thai.pdf diff --git a/docs/pythainlp-1-5-eng.md b/docs/archive/pythainlp-1-5-eng.md similarity index 100% rename from docs/pythainlp-1-5-eng.md rename to docs/archive/pythainlp-1-5-eng.md diff --git a/docs/pythainlp-1-5-thai.md b/docs/archive/pythainlp-1-5-thai.md similarity index 100% rename from docs/pythainlp-1-5-thai.md rename to docs/archive/pythainlp-1-5-thai.md diff --git a/docs/pythainlp-1-6-eng.md b/docs/archive/pythainlp-1-6-eng.md similarity index 100% rename from docs/pythainlp-1-6-eng.md rename to docs/archive/pythainlp-1-6-eng.md diff --git a/docs/pythainlp-1-6-thai.md b/docs/archive/pythainlp-1-6-thai.md similarity index 100% rename from docs/pythainlp-1-6-thai.md rename to docs/archive/pythainlp-1-6-thai.md diff --git a/docs/pythainlp-1-7.md b/docs/archive/pythainlp-1-7.md similarity index 100% rename from docs/pythainlp-1-7.md rename to docs/archive/pythainlp-1-7.md diff --git a/docs/pythainlp-dev-thai.md b/docs/archive/pythainlp-dev-thai.md similarity index 100% rename from docs/pythainlp-dev-thai.md rename to docs/archive/pythainlp-dev-thai.md diff --git a/docs/whatsnew-1.7.md b/docs/whatsnew-1.7.md deleted file mode 100644 index 768b6f450..000000000 --- a/docs/whatsnew-1.7.md +++ /dev/null @@ -1,12 +0,0 @@ -# āļĄāļĩāļ­āļ°āđ„āļĢāđƒāļŦāļĄāđˆāđƒāļ™ PyThaiNLP 1.7 - -## āļŠāļĢāļļāļ›āļ›āļĢāļ°āđ€āļ”āđ‡āļ™āļŠāļģāļ„āļąāļ - -- āđ€āļĨāļīāļāļŠāļ™āļąāļšāļŠāļ™āļļāļ™ Python 2.7 āļ­āļĒāđˆāļēāļ‡āđ€āļ›āđ‡āļ™āļ—āļēāļ‡āļāļēāļĢ -- āđ€āļžāļīāđˆāļĄ ULMFit utility -- āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āļĢāļ°āļšāļšāļ•āļąāļ”āļ„āļģāđƒāļŦāļĄāđˆ āļ—āļąāđ‰āļ‡ newmm āđāļĨāļ° mm -- thai2vec 0.2 -- sentiment analysis āļ•āļąāļ§āđƒāļŦāļĄāđˆāļ—āļģāļ‡āļēāļ™āļ”āđ‰āļ§āļĒ deep learning -- āđ€āļžāļīāđˆāļĄ thai2rom āđ€āļ›āđ‡āļ™ Thai romanization āļ—āļģāļ”āđ‰āļ§āļĒ deep learning āđƒāļ™āļĢāļ°āļ”āļąāļšāļ•āļąāļ§āļ­āļąāļāļĐāļĢ - -āļāļģāļĨāļąāļ‡āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡... diff --git a/examples/collate.py b/examples/collate.py deleted file mode 100644 index d4e30525e..000000000 --- a/examples/collate.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.util import collate - -print(collate(["āđ„āļāđˆ", "āđ„āļ‚āđˆ", "āļ", "āļŪāļē"])) # ['āļ', 'āđ„āļāđˆ', 'āđ„āļ‚āđˆ', 'āļŪāļē'] diff --git a/examples/date.py b/examples/date.py deleted file mode 100644 index 888d9c178..000000000 --- a/examples/date.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- - -import datetime -from pythainlp.util import thai_strftime - -fmt = "%Aāļ—āļĩāđˆ %-d %B āļž.āļĻ. %Y āđ€āļ§āļĨāļē %H:%Māļ™. 
(%a %d-%b-%y)" -date = datetime.datetime(1976, 10, 6, 1, 40) - -# āļ§āļąāļ™āļžāļļāļ˜āļ—āļĩāđˆ 6 āļ•āļļāļĨāļēāļ„āļĄ āļž.āļĻ. 2519 āđ€āļ§āļĨāļē 01:40āļ™. (āļž 06-āļ•.āļ„.-19) -print(thai_strftime(date, fmt)) diff --git a/examples/etcc.py b/examples/etcc.py deleted file mode 100644 index f732fdf11..000000000 --- a/examples/etcc.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tokenize import etcc - -print(etcc.etcc("āļ„āļ·āļ™āļ„āļ§āļēāļĄāļŠāļļāļ‚")) # /āļ„āļ·āļ™/āļ„āļ§āļēāļĄāļŠāļļāļ‚ diff --git a/examples/ner.py b/examples/ner.py deleted file mode 100644 index 773859e84..000000000 --- a/examples/ner.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tag.named_entity import ThaiNameTagger -ner = ThaiNameTagger() -print(ner.get_ner("āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.")) diff --git a/examples/normalize.py b/examples/normalize.py deleted file mode 100644 index cac000306..000000000 --- a/examples/normalize.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.util import normalize - -print(normalize("āđ€āđ€āļ›āļĨāļ") == "āđāļ›āļĨāļ") # āđ€ āđ€ āļ› āļĨ āļ āļāļąāļš āđāļ›āļĨāļ diff --git a/examples/soundex.py b/examples/soundex.py deleted file mode 100644 index 9864ac747..000000000 --- a/examples/soundex.py +++ /dev/null @@ -1,16 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.soundex import lk82, metasound, udom83 - -texts = ["āļšāļđāļĢāļ“āļ°", "āļšāļđāļĢāļ“āļāļēāļĢ", "āļĄāļąāļ", "āļĄāļąāļ„", "āļĄāļĢāļĢāļ„", "āļĨāļąāļ", "āļĢāļąāļ", "āļĢāļąāļāļĐāđŒ", ""] -for text in texts: - print( - "{} - lk82: {} - udom83: {} - metasound: {}".format( - text, lk82(text), udom83(text), metasound(text) - ) - ) - -# check equivalence -print(lk82("āļĢāļ–") == lk82("āļĢāļ”")) -print(udom83("āļ§āļĢāļĢ") == udom83("āļ§āļąāļ™")) -print(metasound("āļ™āļž") == metasound("āļ™āļ ")) diff --git a/examples/spell.py b/examples/spell.py deleted file mode 100644 index 92dbc49f3..000000000 --- a/examples/spell.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.corpus import ttc -from pythainlp.spell import spell -from pythainlp.spell.pn import NorvigSpellChecker -from pythainlp.spell.pn import correct as pn_tnc_correct -from pythainlp.spell.pn import spell as pn_tnc_spell - -# spell checker from pythainlp.spell module (generic) -print(spell("āļŠāļĩāđˆāđ€āļŦāļĨāļĩāļĒāļĄ")) # ['āļŠāļĩāđˆāđ€āļŦāļĨāļĩāđˆāļĒāļĄ'] - -# spell checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's) -print(pn_tnc_spell("āđ€āļŦāļĨāļ·āļĒāļĄ")) -print(pn_tnc_correct("āđ€āļŦāļĨāļ·āļĒāļĄ")) - - -# spell checker from pythainlp.spell.pn module (specified algorithm, custom dictionary) -ttc_word_freqs = ttc.word_freqs() -pn_ttc_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs) -print(pn_ttc_checker.spell("āđ€āļŦāļĨāļ·āļĒāļĄ")) -print(pn_ttc_checker.correct("āđ€āļŦāļĨāļ·āļĒāļĄ")) - -# apply different dictionary filter when creating spell checker -pn_tnc_checker = NorvigSpellChecker() -print(len(pn_tnc_checker.dictionary())) -pn_tnc_checker_no_filter = NorvigSpellChecker(dict_filter=None) -print(len(pn_tnc_checker_no_filter.dictionary())) diff --git a/examples/tcc.py b/examples/tcc.py deleted file mode 100644 index 4d95aed43..000000000 --- a/examples/tcc.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tokenize import tcc - -print(tcc.tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")) # āļ›/āļĢāļ°/āđ€āļ—/āļĻ/āđ„āļ—/āļĒ - 
-print(tcc.tcc_pos("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")) # {1, 3, 5, 6, 8, 9} - -for ch in tcc.tcc_gen("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"): # āļ›-āļĢāļ°-āđ€āļ—-āļĻ-āđ„āļ—-āļĒ- - print(ch, end='-') diff --git a/examples/tokenize.py b/examples/tokenize.py deleted file mode 100644 index 0b8a0d00b..000000000 --- a/examples/tokenize.py +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tokenize import sent_tokenize, word_tokenize - -text = "āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ " -print(text) - -print(sent_tokenize(text)) -# ['āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ', ''] - -print(word_tokenize(text)) -# ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' '] - -print(word_tokenize(text, whitespaces=False)) -# ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ'] - -text2 = "āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™" -print(text2) - -print(word_tokenize(text2)) -# ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™'] - -print(word_tokenize(text2, engine="longest")) -# ['āļāļŽāļŦāļĄāļēāļĒ', 'āđāļĢāļ‡āļ‡āļēāļ™'] diff --git a/examples/transliterate.py b/examples/transliterate.py deleted file mode 100644 index 97fb4e7f1..000000000 --- a/examples/transliterate.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.transliterate import romanize, transliterate - -print(romanize("āđāļĄāļ§")) -print(transliterate("āđāļĄāļ§")) diff --git a/meta.yaml b/meta.yaml index 0bc914207..651053e4e 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "1.7.2" %} +{% set version = "2.0.1" %} package: name: pythainlp diff --git a/notebooks/pythainlp-get-started.ipynb b/notebooks/pythainlp-get-started.ipynb new file mode 100644 index 000000000..806b9e47d --- /dev/null +++ b/notebooks/pythainlp-get-started.ipynb @@ -0,0 +1,1077 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyThaiNLP Get Started\n", + "\n", + "Code examples for basic functions in PyThaiNLP https://github.com/PyThaiNLP/pythainlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Thai Characters\n", + "\n", + "PyThaiNLP provides some ready-to-use Thai character set (e.g. Thai consonants, vowels, tonemarks, symbols) as a string for convenience. There are also few utility functions to test if a string is in Thai or not." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪāļĪāļĶāļ°āļąāļēāļģāļīāļĩāļķāļ·āļļāļđāđ€āđāđ‚āđƒāđ„āđ…āđ‡āđˆāđ‰āđŠāđ‹āļŊāđ†āļšāđŒāđāđŽāđāđšāđ›āđāđ‘āđ’āđ“āđ”āđ•āđ–āđ—āđ˜āđ™āļŋ'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pythainlp\n", + "\n", + "pythainlp.thai_characters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪ'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.thai_consonants" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"āđ”\" in pythainlp.thai_digits" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pythainlp.util\n", + "\n", + "pythainlp.util.isthai(\"āļ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.isthai(\"(āļ.āļž.)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.isthai(\"(āļ.āļž.)\", ignore_chars=\".()\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.countthai(\"āļ§āļąāļ™āļ­āļēāļ—āļīāļ•āļĒāđŒāļ—āļĩāđˆ 24 āļĄāļĩāļ™āļēāļ„āļĄ 2562\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "67.85714285714286" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.countthai(\"āļ§āļąāļ™āļ­āļēāļ—āļīāļ•āļĒāđŒāļ—āļĩāđˆ 24 āļĄāļĩāļ™āļēāļ„āļĄ 2562\", ignore_chars=\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Collation\n", + "\n", + "Sorting according to Thai dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āļāļĢāļĢāđ„āļāļĢ', 'āļāļĢāļ°āļ”āļēāļĐ', 'āđ„āļ‚āđˆ', 'āļ„āđ‰āļ­āļ™', 'āļœāđ‰āļēāđ„āļŦāļĄ']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.util import collate\n", + "\n", + "thai_words = [\"āļ„āđ‰āļ­āļ™\", \"āļāļĢāļ°āļ”āļēāļĐ\", \"āļāļĢāļĢāđ„āļāļĢ\", \"āđ„āļ‚āđˆ\", \"āļœāđ‰āļēāđ„āļŦāļĄ\"]\n", + "collate(thai_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āļœāđ‰āļēāđ„āļŦāļĄ', 'āļ„āđ‰āļ­āļ™', 'āđ„āļ‚āđˆ', 'āļāļĢāļ°āļ”āļēāļĐ', 'āļāļĢāļĢāđ„āļāļĢ']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collate(thai_words, reverse=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Date and Time Format\n", + "\n", + "Get Thai day and month names with Thai Buddhist Era (B.E.)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļ§āļąāļ™āļžāļļāļ˜āļ—āļĩāđˆ 6 āļ•āļļāļĨāļēāļ„āļĄ āļž.āļĻ. 2519 āđ€āļ§āļĨāļē 01:40 āļ™. (āļž 06-āļ•.āļ„.-19)'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import datetime\n", + "from pythainlp.util import thai_strftime\n", + "\n", + "fmt = \"%Aāļ—āļĩāđˆ %-d %B āļž.āļĻ. %Y āđ€āļ§āļĨāļē %H:%M āļ™. (%a %d-%b-%y)\"\n", + "date = datetime.datetime(1976, 10, 6, 1, 40)\n", + "\n", + "thai_strftime(date, fmt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Thai Character Cluster (TCC) and Extended TCC\n", + "\n", + "According to [Character Cluster Based Thai Information Retrieval](https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval) (Theeramunkong et al. 2004)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āļ›', 'āļĢāļ°', 'āđ€āļ—', 'āļĻ', 'āđ„āļ—', 'āļĒ']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp import tcc\n", + "\n", + "tcc.tcc(\"āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{1, 3, 5, 6, 8, 9}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tcc.tcc_pos(\"āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "āļ›/āļĢāļ°/āđ€āļ—/āļĻ/āđ„āļ—/āļĒ/" + ] + } + ], + "source": [ + "for ch in tcc.tcc_gen(\"āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ\"):\n", + " print(ch, end='/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sentence and Word\n", + "\n", + "Default word tokenizer (\"newmm\") use maximum matching algorithm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sent_tokenize: ['āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ']\n", + "word_tokenize: ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' ']\n", + "word_tokenize, without whitespace: ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ']\n" + ] + } + ], + "source": [ + "from pythainlp import sent_tokenize, word_tokenize\n", + "\n", + "text = \"āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ \"\n", + "\n", + "print(\"sent_tokenize:\", sent_tokenize(text))\n", + "print(\"word_tokenize:\", word_tokenize(text))\n", + "print(\"word_tokenize, without whitespace:\", word_tokenize(text, whitespaces=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other algorithm can be chosen. We can also create a tokenizer with custom dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "newmm: ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™', 'āļ‰āļšāļąāļš', 'āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡', 'āđƒāļŦāļĄāđˆ', 'āļ›āļĢāļ°āļāļēāļĻ', 'āđƒāļŠāđ‰āđāļĨāđ‰āļ§']\n", + "longest: ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™', 'āļ‰āļšāļąāļš', 'āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡', 'āđƒāļŦāļĄāđˆ', 'āļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰', 'āđāļĨāđ‰āļ§']\n", + "custom: ['āļāļŽ', 'āļŦāļĄāļēāļĒāđāļĢāļ‡', 'āļ‡āļēāļ™', 'āļ‰āļšāļąāļšāļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āđƒāļŦāļĄāđˆāļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđāļĨāđ‰āļ§']\n" + ] + } + ], + "source": [ + "from pythainlp import word_tokenize, Tokenizer\n", + "\n", + "text = \"āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™āļ‰āļšāļąāļšāļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āđƒāļŦāļĄāđˆāļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđāļĨāđ‰āļ§\"\n", + "\n", + "print(\"newmm:\", word_tokenize(text)) # default engine is \"newmm\"\n", + "print(\"longest:\", word_tokenize(text, engine=\"longest\"))\n", + "\n", + "words = [\"āļāļŽ\", \"āļ‡āļēāļ™\"]\n", + "custom_tokenizer = Tokenizer(words)\n", + "print(\"custom:\", custom_tokenizer.word_tokenize(text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Default word tokenizer use a word list from pythainlp.corpus.common.thai_words().\n", + "We can get that list, add/remove words, and create new tokenizer from the modified list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "newmm: ['āđ„āļ­āđāļ‹āļ„', ' ', 'āļ­āļŠāļī', 'āļĄāļ­', 'āļŸ']\n", + "custom: ['āđ„āļ­āđāļ‹āļ„', ' ', 'āļ­āļŠāļīāļĄāļ­āļŸ']\n" + ] + } + ], + "source": [ + "from pythainlp.corpus.common import thai_words\n", + "from pythainlp import word_tokenize, Tokenizer\n", + "\n", + "text = \"āđ„āļ­āđāļ‹āļ„ āļ­āļŠāļīāļĄāļ­āļŸ\"\n", + "\n", + "print(\"newmm:\", word_tokenize(text))\n", + "\n", + "words = set(thai_words()) # thai_words() returns frozenset\n", + "words.add(\"āļ­āļŠāļīāļĄāļ­āļŸ\")\n", + "custom_tokenizer = Tokenizer(words)\n", + "print(\"custom:\", custom_tokenizer.word_tokenize(text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transliteration" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'maeo'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.transliterate import romanize\n", + "\n", + "romanize(\"āđāļĄāļ§\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mɛːw\n" + ] + } + ], + "source": [ + "from pythainlp.transliterate import transliterate\n", + "\n", + "print(transliterate(\"āđāļĄāļ§\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Normalization" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.util import normalize\n", + "\n", + "normalize(\"āđ€āđ€āļ›āļĨāļ\") == \"āđāļ›āļĨāļ\" # āđ€ āđ€ āļ› āļĨ āļ vs āđāļ›āļĨāļ" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Soundex\n", + "\n", + "\"Soundex is a phonetic algorithm for indexing names by sound.\" ([Wikipedia](https://en.wikipedia.org/wiki/Soundex)). PyThaiNLP provides three kinds of Thai soundex." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "from pythainlp.soundex import lk82, metasound, udom83\n", + "\n", + "# check equivalence\n", + "print(lk82(\"āļĢāļ–\") == lk82(\"āļĢāļ”\"))\n", + "print(udom83(\"āļ§āļĢāļĢ\") == udom83(\"āļ§āļąāļ™\"))\n", + "print(metasound(\"āļ™āļž\") == metasound(\"āļ™āļ \"))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "āļšāļđāļĢāļ“āļ° - lk82: āļšE400 - udom83: āļš930000 - metasound: āļš550\n", + "āļšāļđāļĢāļ“āļāļēāļĢ - lk82: āļšE419 - udom83: āļš931900 - metasound: āļš551\n", + "āļĄāļąāļ - lk82: āļĄ1000 - udom83: āļĄ100000 - metasound: āļĄ100\n", + "āļĄāļąāļ„ - lk82: āļĄ1000 - udom83: āļĄ100000 - metasound: āļĄ100\n", + "āļĄāļĢāļĢāļ„ - lk82: āļĄ1000 - udom83: āļĄ310000 - metasound: āļĄ551\n", + "āļĨāļąāļ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĨ100\n", + "āļĢāļąāļ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĢ100\n", + "āļĢāļąāļāļĐāđŒ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĢ100\n", + " - lk82: - udom83: - metasound: \n" + ] + } + ], + "source": [ + "texts = [\"āļšāļđāļĢāļ“āļ°\", \"āļšāļđāļĢāļ“āļāļēāļĢ\", \"āļĄāļąāļ\", \"āļĄāļąāļ„\", \"āļĄāļĢāļĢāļ„\", \"āļĨāļąāļ\", \"āļĢāļąāļ\", \"āļĢāļąāļāļĐāđŒ\", \"\"]\n", + "for text in texts:\n", + " print(\n", + " \"{} - lk82: {} - udom83: {} - metasound: {}\".format(\n", + " text, lk82(text), udom83(text), metasound(text)\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spellchecking\n", + "\n", + "Default spellchecker uses [Peter Norvig's algorithm](http://www.norvig.com/spell-correct.html) together with word frequency from Thai National Corpus (TNC)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āđ€āļŦāļĨāļĩāļĒāļĄ', 'āđ€āļŦāļĨāļ·āļ­āļĄ']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp import spell\n", + "\n", + "# list possible spellings\n", + "spell(\"āđ€āļŦāļĨāļ·āļĒāļĄ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āđ€āļŦāļĨāļĩāļĒāļĄ'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp import correct\n", + "\n", + "# choose the most likely spelling\n", + "correct(\"āđ€āļŦāļĨāļ·āļĒāļĄ\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spellchecking - Custom dictionary and word frequency\n", + "\n", + "Custom dictionary can be provided when creating spellchecker." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['āđ€āļŦāļĨāļ·āļ­āļĄ']\n", + "āđ€āļŦāļĨāļ·āļ­āļĄ\n" + ] + } + ], + "source": [ + "from pythainlp.corpus import ttc # Thai Textbook Corpus\n", + "from pythainlp.spell import NorvigSpellChecker\n", + "\n", + "checker = NorvigSpellChecker(custom_dict=ttc.word_freqs())\n", + "print(checker.spell(\"āđ€āļŦāļĨāļ·āļĒāļĄ\"))\n", + "print(checker.correct(\"āđ€āļŦāļĨāļ·āļĒāļĄ\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('āļˆāļ°', 51681),\n", + " ('āđ€āļ›āđ‡āļ™', 51273),\n", + " ('āđ„āļ›', 46567),\n", + " ('āļāđ‡', 46409),\n", + " ('āđ„āļĄāđˆ', 45895),\n", + " ('āļĄāļĩ', 44899),\n", + " ('āđ„āļ”āđ‰', 44513),\n", + " ('āļ§āđˆāļē', 40290),\n", + " ('āđƒāļŦāđ‰', 38715)]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(checker.dictionary())[1:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also apply conditions and filter function to dictionary when creating spellchecker." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39977" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checker = NorvigSpellChecker() # use default filter (remove any word with number or non-Thai character)\n", + "len(checker.dictionary())" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "30379" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checker = NorvigSpellChecker(min_freq=5, min_len=2, max_len=15)\n", + "len(checker.dictionary())" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76706" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checker_no_filter = NorvigSpellChecker(dict_filter=None) # use no filter\n", + "len(checker_no_filter.dictionary())" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76700" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def remove_yamok(word):\n", + " return False if \"āđ†\" in word else True\n", + "\n", + "checker_custom_filter = NorvigSpellChecker(dict_filter=remove_yamok) # use custom filter\n", + "len(checker_custom_filter.dictionary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part-of-Speech Tagging" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('āļāļēāļĢ', 'FIXN'), ('āđ€āļ”āļīāļ™āļ—āļēāļ‡', 'VACT')]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.tag import pos_tag, pos_tag_sents\n", + "\n", + "pos_tag([\"āļāļēāļĢ\",\"āđ€āļ”āļīāļ™āļ—āļēāļ‡\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[('āļĢāļēāļŠāļāļīāļˆāļˆāļēāļ™āļļāđ€āļšāļāļĐāļē', 'NCMN'),\n", + " 
('āđ€āļœāļĒāđāļžāļĢāđˆ', 'VACT'),\n", + " ('āļ›āļĢāļ°āļāļēāļĻāļŠāļģāļ™āļąāļāļ™āļēāļĒāļāļŊ', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āđƒāļŦāđ‰', 'JSBR'),\n", + " (' ', 'PUNC'),\n", + " (\"'āļžāļĨ.āļ—.āļŠāļĢāļĢāđ€āļŠāļĢāļīāļ āđāļāđ‰āļ§āļāļģāđ€āļ™āļīāļ”'\", 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āļžāđ‰āļ™āļˆāļēāļāļ•āļģāđāļŦāļ™āđˆāļ‡', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āļœāļđāđ‰āļ—āļĢāļ‡āļ„āļļāļ“āļ§āļļāļ’āļīāļžāļīāđ€āļĻāļĐ', 'NCMN'),\n", + " ('āļāļ­āļ‡āļ—āļąāļžāļšāļ', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āļāļĢāļ°āļ—āļĢāļ§āļ‡āļāļĨāļēāđ‚āļŦāļĄ', 'NCMN')],\n", + " [('āđāļĨāļ°', 'JCRG'),\n", + " ('āđāļ•āđˆāļ‡āļ•āļąāđ‰āļ‡', 'VACT'),\n", + " ('āđƒāļŦāđ‰', 'JSBR'),\n", + " ('āđ€āļ›āđ‡āļ™', 'VSTA'),\n", + " ('āļ‚āđ‰āļēāļĢāļēāļŠāļāļēāļĢ', 'NCMN'),\n", + " ('āļžāļĨāđ€āļĢāļ·āļ­āļ™', 'NCMN'),\n", + " ('āļŠāļēāļĄāļąāļ', 'NCMN'),\n", + " ('āļ•āļģāđāļŦāļ™āđˆāļ‡', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " (\"'āļ­āļ˜āļīāļšāļ”āļĩāļāļĢāļĄāļ›āļĢāļ°āļŠāļēāļŠāļąāļĄāļžāļąāļ™āļ˜āđŒ'\", 'NCMN')]]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sents = [[\"āļĢāļēāļŠāļāļīāļˆāļˆāļēāļ™āļļāđ€āļšāļāļĐāļē\", \"āđ€āļœāļĒāđāļžāļĢāđˆ\", \"āļ›āļĢāļ°āļāļēāļĻāļŠāļģāļ™āļąāļāļ™āļēāļĒāļāļŊ\", \" \", \"āđƒāļŦāđ‰\",\n", + " \" \", \"'āļžāļĨ.āļ—.āļŠāļĢāļĢāđ€āļŠāļĢāļīāļ āđāļāđ‰āļ§āļāļģāđ€āļ™āļīāļ”'\", \" \", \"āļžāđ‰āļ™āļˆāļēāļāļ•āļģāđāļŦāļ™āđˆāļ‡\",\n", + " \" \", \"āļœāļđāđ‰āļ—āļĢāļ‡āļ„āļļāļ“āļ§āļļāļ’āļīāļžāļīāđ€āļĻāļĐ\", \"āļāļ­āļ‡āļ—āļąāļžāļšāļ\", \" \", \"āļāļĢāļ°āļ—āļĢāļ§āļ‡āļāļĨāļēāđ‚āļŦāļĄ\"],\n", + " [\"āđāļĨāļ°\",\"āđāļ•āđˆāļ‡āļ•āļąāđ‰āļ‡\",\"āđƒāļŦāđ‰\", \"āđ€āļ›āđ‡āļ™\", \"āļ‚āđ‰āļēāļĢāļēāļŠāļāļēāļĢ\", \"āļžāļĨāđ€āļĢāļ·āļ­āļ™\", \"āļŠāļēāļĄāļąāļ\",\n", + " \"āļ•āļģāđāļŦāļ™āđˆāļ‡\", \" \", \"'āļ­āļ˜āļīāļšāļ”āļĩāļāļĢāļĄāļ›āļĢāļ°āļŠāļēāļŠāļąāļĄāļžāļąāļ™āļ˜āđŒ'\"]]\n", + "\n", + "pos_tag_sents(sents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Named-Entity Tagging\n", + "\n", + "The tagger use BIO scheme:\n", + "- B - beginning of entity\n", + "- I - inside entity\n", + "- O - outside entity" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('āļ§āļąāļ™āļ—āļĩāđˆ', 'NOUN', 'O'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('15', 'NUM', 'B-DATE'),\n", + " (' ', 'PUNCT', 'I-DATE'),\n", + " ('āļ.āļĒ.', 'NOUN', 'I-DATE'),\n", + " (' ', 'PUNCT', 'I-DATE'),\n", + " ('61', 'NUM', 'I-DATE'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('āļ—āļ”āļŠāļ­āļš', 'VERB', 'O'),\n", + " ('āļĢāļ°āļšāļš', 'NOUN', 'O'),\n", + " ('āđ€āļ§āļĨāļē', 'NOUN', 'O'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('14', 'NOUN', 'B-TIME'),\n", + " (':', 'PUNCT', 'I-TIME'),\n", + " ('49', 'NUM', 'I-TIME'),\n", + " (' ', 'PUNCT', 'I-TIME'),\n", + " ('āļ™.', 'NOUN', 'I-TIME')]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.tag.named_entity import ThaiNameTagger\n", + "\n", + "ner = ThaiNameTagger()\n", + "ner.get_ner(\"āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 
61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word Vector" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English\n", + "INFO:gensim.models.utils_any2vec:loading projection weights from /Users/arthit/pythainlp-data/thai2vec.bin\n", + "INFO:gensim.models.utils_any2vec:loaded (60001, 400) matrix from /Users/arthit/pythainlp-data/thai2vec.bin\n", + "/usr/local/lib/python3.7/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "0.99259853" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pythainlp.word_vector\n", + "\n", + "pythainlp.word_vector.similarity(\"āļ„āļ™\", \"āļĄāļ™āļļāļĐāļĒāđŒ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors\n" + ] + }, + { + "data": { + "text/plain": [ + "'āđāļĄāļ§'" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.word_vector.doesnt_match([\"āļ„āļ™\", \"āļĄāļ™āļļāļĐāļĒāđŒ\", \"āļšāļļāļ„āļ„āļĨ\", \"āđ€āļˆāđ‰āļēāļŦāļ™āđ‰āļēāļ—āļĩāđˆ\", \"āđāļĄāļ§\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Number Spell Out" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļŦāļ™āļķāđˆāļ‡āļĨāđ‰āļēāļ™āļŠāļ­āļ‡āđāļŠāļ™āļŠāļēāļĄāļŦāļĄāļ·āđˆāļ™āļŠāļĩāđˆāļžāļąāļ™āļŦāđ‰āļēāļĢāđ‰āļ­āļĒāļŦāļāļŠāļīāļšāđ€āļˆāđ‡āļ”āļĨāđ‰āļēāļ™āđāļ›āļ”āđāļŠāļ™āđ€āļāđ‰āļēāļŦāļĄāļ·āđˆāļ™āļŦāļ™āļķāđˆāļ‡āļĢāđ‰āļ­āļĒāļĒāļĩāđˆāļŠāļīāļšāļŠāļēāļĄāļšāļēāļ—āļŠāļĩāđˆāļŠāļīāļšāļŦāđ‰āļēāļŠāļ•āļēāļ‡āļ„āđŒ'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.util import bahttext\n", + "\n", + "bahttext(1234567890123.45)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļŦāļ™āļķāđˆāļ‡āļšāļēāļ—āđ€āļāđ‰āļēāļŠāļīāļšāđ€āļ­āđ‡āļ”āļŠāļ•āļēāļ‡āļ„āđŒ'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bahttext(1.909)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/sentiment_analysis.ipynb b/notebooks/sentiment_analysis.ipynb index 58b659687..a1ab56694 100644 --- a/notebooks/sentiment_analysis.ipynb +++ 
b/notebooks/sentiment_analysis.ipynb @@ -47,12 +47,14 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", + "import re\n", + "\n", + "import emoji\n", "import numpy as np\n", + "import pandas as pd\n", + "\n", "from pythainlp import word_tokenize\n", "from tqdm import tqdm_notebook\n", - "import re\n", - "import emoji\n", "\n", "#viz\n", "import matplotlib.pyplot as plt\n", @@ -79,8 +81,8 @@ "def replace_rep(text):\n", " def _replace_rep(m):\n", " c,cc = m.groups()\n", - " return f'{c}xxrep'\n", - " re_rep = re.compile(r'(\\S)(\\1{2,})')\n", + " return f\"{c}xxrep\"\n", + " re_rep = re.compile(r\"(\\S)(\\1{2,})\")\n", " return re_rep.sub(_replace_rep, text)\n", "\n", "def ungroup_emoji(toks):\n", @@ -100,7 +102,7 @@ " res = replace_rep(res)\n", " \n", " #tokenize\n", - " res = [word for word in word_tokenize(res, engine='ulmfit') if word and not re.search(pattern=r\"\\s+\", string=word)]\n", + " res = [word for word in word_tokenize(res, engine=\"ulmfit\") if word and not re.search(pattern=r\"\\s+\", string=word)]\n", " \n", " #post rules\n", " res = ungroup_emoji(res)\n", @@ -123,15 +125,13 @@ }, "outputs": [], "source": [ - "with open('train.txt') as f:\n", + "with open(\"train.txt\") as f:\n", " texts = [line.strip() for line in f.readlines()]\n", - "f.close()\n", "\n", - "with open('train_label.txt') as f:\n", + "with open(\"train_label.txt\") as f:\n", " categories = [line.strip() for line in f.readlines()]\n", - "f.close()\n", "\n", - "all_df = pd.DataFrame({'category':categories, 'texts':texts})\n", + "all_df = pd.DataFrame({\"category\":categories, \"texts\":texts})\n", "all_df.shape" ] }, @@ -141,11 +141,10 @@ "metadata": {}, "outputs": [], "source": [ - "with open('test.txt') as f:\n", + "with open(\"test.txt\") as f:\n", " texts = [line.strip() for line in f.readlines()]\n", - "f.close()\n", "\n", - "test_df = pd.DataFrame({'category':'test', 'texts':texts})\n", + "test_df = pd.DataFrame({\"category\":\"test\", \"texts\":texts})\n", "test_df.shape" ] }, @@ -162,16 +161,16 @@ "metadata": {}, "outputs": [], "source": [ - "all_df = pd.read_csv('all_df.csv')\n", - "test_df = pd.read_csv('test_df.csv')\n", + "all_df = pd.read_csv(\"all_df.csv\")\n", + "test_df = pd.read_csv(\"test_df.csv\")\n", "\n", - "all_df['processed'] = all_df.texts.map(lambda x: '|'.join(process_text(x)))\n", - "all_df['wc'] = all_df.processed.map(lambda x: len(x.split('|')))\n", - "all_df['uwc'] = all_df.processed.map(lambda x: len(set(x.split('|'))))\n", + "all_df[\"processed\"] = all_df.texts.map(lambda x: \"|\".join(process_text(x)))\n", + "all_df[\"wc\"] = all_df.processed.map(lambda x: len(x.split(\"|\")))\n", + "all_df[\"uwc\"] = all_df.processed.map(lambda x: len(set(x.split(\"|\"))))\n", "\n", - "test_df['processed'] = test_df.texts.map(lambda x: '|'.join(process_text(x)))\n", - "test_df['wc'] = test_df.processed.map(lambda x: len(x.split('|')))\n", - "test_df['uwc'] = test_df.processed.map(lambda x: len(set(x.split('|'))))" + "test_df[\"processed\"] = test_df.texts.map(lambda x: \"|\".join(process_text(x)))\n", + "test_df[\"wc\"] = test_df.processed.map(lambda x: len(x.split(\"|\")))\n", + "test_df[\"uwc\"] = test_df.processed.map(lambda x: len(set(x.split(\"|\"))))" ] }, { @@ -352,7 +351,7 @@ ], "source": [ "#prevalence\n", - "print(train_df['category'].value_counts() / train_df.shape[0])" + "print(train_df[\"category\"].value_counts() / train_df.shape[0])" ] }, { @@ -374,7 +373,7 @@ ], "source": [ "#prevalence\n", - "print(valid_df['category'].value_counts() / 
valid_df.shape[0])" + "print(valid_df[\"category\"].value_counts() / valid_df.shape[0])" ] }, { @@ -398,8 +397,8 @@ "outputs": [], "source": [ "#dependent variables\n", - "y_train = train_df['category']\n", - "y_valid = valid_df['category']" + "y_train = train_df[\"category\"]\n", + "y_valid = valid_df[\"category\"]" ] }, { @@ -424,10 +423,10 @@ "from sklearn.linear_model import LogisticRegression\n", "\n", "tfidf = TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2), min_df=20, sublinear_tf=True)\n", - "tfidf_fit = tfidf.fit(all_df['texts'])\n", - "text_train = tfidf_fit.transform(train_df['texts'])\n", - "text_valid = tfidf_fit.transform(valid_df['texts'])\n", - "text_test = tfidf_fit.transform(test_df['texts'])\n", + "tfidf_fit = tfidf.fit(all_df[\"texts\"])\n", + "text_train = tfidf_fit.transform(train_df[\"texts\"])\n", + "text_valid = tfidf_fit.transform(valid_df[\"texts\"])\n", + "text_test = tfidf_fit.transform(test_df[\"texts\"])\n", "text_train.shape, text_valid.shape" ] }, @@ -459,11 +458,11 @@ "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler()\n", - "scaler_fit = scaler.fit(all_df[['wc','uwc']].astype(float))\n", + "scaler_fit = scaler.fit(all_df[[\"wc\",\"uwc\"]].astype(float))\n", "print(scaler_fit.mean_, scaler_fit.var_)\n", - "num_train = scaler_fit.transform(train_df[['wc','uwc']].astype(float))\n", - "num_valid = scaler_fit.transform(valid_df[['wc','uwc']].astype(float))\n", - "num_test = scaler_fit.transform(test_df[['wc','uwc']].astype(float))\n", + "num_train = scaler_fit.transform(train_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_valid = scaler_fit.transform(valid_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_test = scaler_fit.transform(test_df[[\"wc\",\"uwc\"]].astype(float))\n", "num_train.shape, num_valid.shape" ] }, @@ -516,7 +515,7 @@ ], "source": [ "#fit logistic regression models\n", - "model = LogisticRegression(C=2., penalty='l2', solver='liblinear', dual=False, multi_class='ovr')\n", + "model = LogisticRegression(C=2., penalty=\"l2\", solver=\"liblinear\", dual=False, multi_class=\"ovr\")\n", "model.fit(X_train,y_train)\n", "model.score(X_valid,y_valid)" ] @@ -537,14 +536,14 @@ "probs = model.predict_proba(X_valid)\n", "probs_df = pd.DataFrame(probs)\n", "probs_df.columns = model.classes_\n", - "probs_df['preds'] = model.predict(X_valid)\n", - "probs_df['category'] = valid_df.category\n", - "probs_df['texts'] = valid_df.texts\n", - "probs_df['processed'] = valid_df.processed\n", - "probs_df['wc'] = valid_df.wc\n", - "probs_df['uwc'] = valid_df.uwc\n", - "probs_df['hit'] = (probs_df.preds==probs_df.category)\n", - "probs_df.to_csv('probs_df_linear.csv',index=False)" + "probs_df[\"preds\"] = model.predict(X_valid)\n", + "probs_df[\"category\"] = valid_df.category\n", + "probs_df[\"texts\"] = valid_df.texts\n", + "probs_df[\"processed\"] = valid_df.processed\n", + "probs_df[\"wc\"] = valid_df.wc\n", + "probs_df[\"uwc\"] = valid_df.uwc\n", + "probs_df[\"hit\"] = (probs_df.preds==probs_df.category)\n", + "probs_df.to_csv(\"probs_df_linear.csv\", index=False)" ] }, { @@ -577,10 +576,10 @@ "\n", "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", "print(model.score(X_valid,y_valid))\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", " xticklabels=model.classes_, yticklabels=model.classes_)\n", - "plt.ylabel('Actual')\n", - "plt.xlabel('Predicted')\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", "plt.show()" ] }, @@ -601,8 +600,8 @@ 
"from fastai.callbacks import CSVLogger, SaveModelCallback\n", "from pythainlp.ulmfit import *\n", "\n", - "model_path = 'wisesight_data/'\n", - "all_df = pd.read_csv('all_df.csv')\n", + "model_path = \"wisesight_data/\"\n", + "all_df = pd.read_csv(\"all_df.csv\")\n", "train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)" ] }, @@ -619,11 +618,11 @@ "metadata": {}, "outputs": [], "source": [ - "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", " NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=2)]\n", "\n", - "data_lm = (TextList.from_df(all_df, model_path, cols='texts', processor=processor)\n", + "data_lm = (TextList.from_df(all_df, model_path, cols=\"texts\", processor=processor)\n", " .random_split_by_pct(valid_pct = 0.01, seed = 1412)\n", " .label_for_lm()\n", " .databunch(bs=48))\n", @@ -708,7 +707,7 @@ ], "source": [ "#train frozen\n", - "print('training frozen')\n", + "print(\"training frozen\")\n", "learn.freeze_to(-1)\n", "learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))" ] @@ -777,7 +776,7 @@ ], "source": [ "#train unfrozen\n", - "print('training unfrozen')\n", + "print(\"training unfrozen\")\n", "learn.unfreeze()\n", "learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))" ] @@ -789,7 +788,7 @@ "outputs": [], "source": [ "# learn.save('wisesight_lm')\n", - "learn.save_encoder('wisesight_enc')" + "learn.save_encoder(\"wisesight_enc\")" ] }, { @@ -814,17 +813,17 @@ ], "source": [ "#lm data\n", - "data_lm = load_data(model_path,'wisesight_lm.pkl')\n", + "data_lm = load_data(model_path, \"wisesight_lm.pkl\")\n", "data_lm.sanity_check()\n", "\n", "#classification data\n", - "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", " NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=20)]\n", "\n", - "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=['texts'], processor=processor),\n", - " valid=TextList.from_df(valid_df, model_path, cols=['texts'], processor=processor))\n", - " .label_from_df('category')\n", + "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=[\"texts\"], processor=processor),\n", + " valid=TextList.from_df(valid_df, model_path, cols=[\"texts\"], processor=processor))\n", + " .label_from_df(\"category\")\n", " .databunch(bs=50)\n", " )\n", "data_cls.sanity_check()\n", @@ -844,7 +843,7 @@ "\n", "learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", "#load pretrained finetuned model\n", - "learn.load_encoder('wisesight_enc')" + "learn.load_encoder(\"wisesight_enc\")" ] }, { @@ -909,7 +908,8 @@ "metadata": {}, "outputs": [], "source": [ - "learn.load('bestmodel');\n", + "learn.load(\"bestmodel\")\n", + "\n", "#get predictions\n", "probs, y_true, loss = learn.get_preds(ds_type = DatasetType.Valid, ordered=True, with_loss=True)\n", "classes = learn.data.train_ds.classes\n", @@ -938,9 +938,9 @@ "source": [ "to_df = np.concatenate([y_true[:,None],preds[:,None],loss[:,None],prob],1)\n", "probs_df = pd.DataFrame(to_df)\n", - 
"probs_df.columns = ['category','preds','loss'] + classes\n", - "probs_df['hit'] = (probs_df.category == probs_df.preds)\n", - "probs_df['texts'] = valid_df.texts\n", + "probs_df.columns = [\"category\",\"preds\",\"loss\"] + classes\n", + "probs_df[\"hit\"] = (probs_df.category == probs_df.preds)\n", + "probs_df[\"texts\"] = valid_df.texts\n", "(y_true==preds).mean()" ] }, @@ -967,10 +967,10 @@ "import seaborn as sns\n", "\n", "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", " xticklabels=classes, yticklabels=classes)\n", - "plt.ylabel('Actual')\n", - "plt.xlabel('Predicted')\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", "plt.show()" ] } @@ -991,7 +991,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 7d873a9a1..9ab5ff1ad 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,6 +1,6 @@ ïŧŋ# -*- coding: utf-8 -*- -__version__ = 2.0 +__version__ = "2.0.1" thai_consonants = "āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪ" # 44 chars thai_vowels = "āļĪāļĶāļ°\u0e31āļēāļģ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39āđ€āđāđ‚āđƒāđ„\u0e45\u0e47" # 19 @@ -25,8 +25,8 @@ from pythainlp.soundex import soundex -from pythainlp.spell import spell +from pythainlp.spell import correct, spell from pythainlp.tag import pos_tag -from pythainlp.tokenize import sent_tokenize, tcc, word_tokenize +from pythainlp.tokenize import sent_tokenize, tcc, word_tokenize, Tokenizer from pythainlp.transliterate import romanize, transliterate from pythainlp.util import collate, thai_strftime diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index f41744dac..855215bc0 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -113,7 +113,7 @@ def download(name: str, force: bool = False): data_json = data.json() if name in list(data_json.keys()): temp_name = data_json[name] - print("Download : " + name) + print("Download: " + name) if not db.search(temp.name == name): print(name + " " + temp_name["version"]) diff --git a/pythainlp/soundex/__init__.py b/pythainlp/soundex/__init__.py index 30cfcd0a7..fac5f978d 100644 --- a/pythainlp/soundex/__init__.py +++ b/pythainlp/soundex/__init__.py @@ -12,7 +12,7 @@ # [KSS97] https://linux.thai.net/~thep/soundex/soundex.html -def soundex(text, engine="udom83"): +def soundex(text: str, engine="udom83") -> str: """ Thai Soundex diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py index f7b21a764..e0dee6d6b 100644 --- a/pythainlp/soundex/lk82.py +++ b/pythainlp/soundex/lk82.py @@ -21,7 +21,7 @@ _RE_3 = re.compile(r"[āđ‡āđāļšāđ†āļŊ]") -def lk82(text): +def lk82(text: str) -> str: """ LK82 - It's a Thai soundex rule. 
diff --git a/pythainlp/soundex/metasound.py b/pythainlp/soundex/metasound.py index c5f7f8233..6998f81a9 100644 --- a/pythainlp/soundex/metasound.py +++ b/pythainlp/soundex/metasound.py @@ -20,7 +20,7 @@ _C8 = "āļ§" # W -> 8 -def metasound(text, length=4): +def metasound(text: str, length: int = 4) -> str: """ Thai MetaSound diff --git a/pythainlp/soundex/udom83.py b/pythainlp/soundex/udom83.py index bf7ec5bba..dce60feaa 100644 --- a/pythainlp/soundex/udom83.py +++ b/pythainlp/soundex/udom83.py @@ -29,7 +29,7 @@ ) -def udom83(text): +def udom83(text: str) -> str: """ Udom83 - It's a Thai soundex rule. diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index cfd06682b..c4b654f53 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -3,11 +3,14 @@ Spell checking """ -from .pn import correct as pn_correct -from .pn import spell as pn_spell +from typing import List +from .pn import DEFAULT_SPELL_CHECKER, NorvigSpellChecker -def spell(word, engine="pn"): +__all__ = ["DEFAULT_SPELL_CHECKER", "correct", "spell", "NorvigSpellChecker"] + + +def spell(word: str, engine="pn") -> List[str]: """ :param str word: word to check spelling :param str engine: @@ -15,10 +18,10 @@ def spell(word, engine="pn"): :return: list of words """ - return pn_spell(word) + return DEFAULT_SPELL_CHECKER.spell(word) -def correct(word, engine="pn"): +def correct(word: str, engine="pn") -> str: """ :param str word: word to correct spelling :param str engine: @@ -26,4 +29,4 @@ def correct(word, engine="pn"): :return: the corrected word """ - return pn_correct(word) + return DEFAULT_SPELL_CHECKER.correct(word) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 84def66f3..ddce3d5c7 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -7,26 +7,33 @@ Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html """ from collections import Counter +from typing import Callable, List, Set, Tuple from pythainlp import thai_letters from pythainlp.corpus import tnc -from pythainlp.util import is_thaichar +from pythainlp.util import isthaichar -def _no_filter(word): +def _no_filter(word: str) -> bool: return True -def _is_thai_and_not_num(word): +def _is_thai_and_not_num(word: str) -> bool: for ch in word: - if ch != "." and not is_thaichar(ch): + if ch != "." 
and not isthaichar(ch): return False if ch in "āđāđ‘āđ’āđ“āđ”āđ•āđ–āđ—āđ˜āđ™0123456789": return False return True -def _keep(word_freq, min_freq, min_len, max_len, dict_filter): +def _keep( + word_freq: int, + min_freq: int, + min_len: int, + max_len: int, + dict_filter: Callable[[str], bool], +): """ Keep only Thai words with at least min_freq frequency and has length between min_len and max_len characters @@ -41,7 +48,7 @@ def _keep(word_freq, min_freq, min_len, max_len, dict_filter): return dict_filter(word) -def _edits1(word): +def _edits1(word: str) -> Set[str]: """ Return a set of words with edit distance of 1 from the input word """ @@ -54,7 +61,7 @@ def _edits1(word): return set(deletes + transposes + replaces + inserts) -def _edits2(word): +def _edits2(word: str) -> Set[str]: """ Return a set of words with edit distance of 2 from the input word """ @@ -64,11 +71,11 @@ def _edits2(word): class NorvigSpellChecker: def __init__( self, - custom_dict=None, - min_freq=2, - min_len=2, - max_len=40, - dict_filter=_is_thai_and_not_num, + custom_dict: List[Tuple[str, int]] = None, + min_freq: int = 2, + min_len: int = 2, + max_len: int = 40, + dict_filter: Callable[[str], bool] = _is_thai_and_not_num, ): """ Initialize Peter Norvig's spell checker object @@ -97,13 +104,13 @@ def __init__( if self.__WORDS_TOTAL < 1: self.__WORDS_TOTAL = 0 - def dictionary(self): + def dictionary(self) -> List[Tuple[str, int]]: """ Return the spelling dictionary currently used by this spell checker """ return self.__WORDS.items() - def known(self, words): + def known(self, words: List[str]) -> List[str]: """ Return a list of given words that found in the spelling dictionary @@ -111,7 +118,7 @@ def known(self, words): """ return list(w for w in words if w in self.__WORDS) - def prob(self, word): + def prob(self, word: str) -> float: """ Return probability of an input word, according to the spelling dictionary @@ -119,7 +126,7 @@ def prob(self, word): """ return self.__WORDS[word] / self.__WORDS_TOTAL - def freq(self, word): + def freq(self, word: str) -> int: """ Return frequency of an input word, according to the spelling dictionary @@ -127,7 +134,7 @@ def freq(self, word): """ return self.__WORDS[word] - def spell(self, word): + def spell(self, word: str) -> List[str]: """ Return a list of possible words, according to edit distance of 1 and 2, sorted by frequency of word occurrance in the spelling dictionary @@ -147,7 +154,7 @@ def spell(self, word): return candidates - def correct(self, word): + def correct(self, word: str) -> str: """ Return the most possible word, using the probability from the spelling dictionary @@ -160,49 +167,3 @@ def correct(self, word): DEFAULT_SPELL_CHECKER = NorvigSpellChecker() - - -def dictionary(): - """ - Return the spelling dictionary currently used by this spell checker. - The spelling dictionary is based on words found in the Thai National Corpus. - """ - return DEFAULT_SPELL_CHECKER.dictionary() - - -def known(words): - """ - Return a list of given words that found in the spelling dictionary. - The spelling dictionary is based on words found in the Thai National Corpus. 
- - :param str words: A list of words to check if they are in the spelling dictionary - """ - return DEFAULT_SPELL_CHECKER.known(words) - - -def prob(word): - """ - Return probability of an input word, according to the Thai National Corpus - - :param str word: A word to check its probability of occurrence - """ - return DEFAULT_SPELL_CHECKER.prob(word) - - -def spell(word): - """ - Return a list of possible words, according to edit distance of 1 and 2, - sorted by probability of word occurrance in the Thai National Corpus. - - :param str word: A word to check its spelling - """ - return DEFAULT_SPELL_CHECKER.spell(word) - - -def correct(word): - """ - Return the most possible word, according to probability from the Thai National Corpus - - :param str word: A word to correct its spelling - """ - return DEFAULT_SPELL_CHECKER.correct(word) diff --git a/pythainlp/summarize/freq.py b/pythainlp/summarize/freq.py index c7bc25ff9..2dc7044fd 100644 --- a/pythainlp/summarize/freq.py +++ b/pythainlp/summarize/freq.py @@ -33,10 +33,10 @@ def __compute_frequencies(self, word_tokenized_sents): return word_freqs - def __rank(self, ranking, n): + def __rank(self, ranking, n: int): return nlargest(n, ranking, key=ranking.get) - def summarize(self, text, n, tokenizer): + def summarize(self, text: str, n: int, tokenizer: str): sents = sent_tokenize(text) word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents] self.__freq = self.__compute_frequencies(word_tokenized_sents) diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 9b0232b78..985991415 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -4,31 +4,29 @@ such as its part of speech and class of named-entity. """ -__all__ = [ - "pos_tag", - "pos_tag_sents", - "tag_provinces" -] +from typing import List, Tuple + +__all__ = ["pos_tag", "pos_tag_sents", "tag_provinces"] from .locations import tag_provinces # tag map for orchid to Universal Dependencies -# from Korakot Chaovavanich +# from Korakot Chaovavanich _TAG_MAP_UD = { - #NOUN - "NOUN":"NOUN", - "NCMN":"NOUN", - "NTTL":"NOUN", - "CNIT":"NOUN", - "CLTV":"NOUN", - "CMTR":"NOUN", - "CFQC":"NOUN", - "CVBL":"NOUN", + # NOUN + "NOUN": "NOUN", + "NCMN": "NOUN", + "NTTL": "NOUN", + "CNIT": "NOUN", + "CLTV": "NOUN", + "CMTR": "NOUN", + "CFQC": "NOUN", + "CVBL": "NOUN", # VERB - "VACT":"VERB", - "VSTA":"VERB", - #PRON - "PRON":"PRON", - "NPRP":"PRON", + "VACT": "VERB", + "VSTA": "VERB", + # PRON + "PRON": "PRON", + "NPRP": "PRON", # ADJ "ADJ": "ADJ", "NONM": "ADJ", @@ -40,13 +38,13 @@ "ADVI": "ADV", "ADVP": "ADV", "ADVS": "ADV", - # INT + # INT "INT": "INTJ", # PRON - "PROPN":"PROPN", - "PPRS":"PROPN", - "PDMN":"PROPN", - "PNTR":"PROPN", + "PROPN": "PROPN", + "PPRS": "PROPN", + "PDMN": "PROPN", + "PNTR": "PROPN", # DET "DET": "DET", "DDAN": "DET", @@ -62,51 +60,69 @@ "NCNM": "NUM", "NLBL": "NUM", "DCNM": "NUM", - # AUX + # AUX "AUX": "AUX", "XVBM": "AUX", "XVAM": "AUX", "XVMM": "AUX", "XVBB": "AUX", "XVAE": "AUX", - # ADP + # ADP "ADP": "ADP", "RPRE": "ADP", # CCONJ - "CCONJ":"CCONJ", - "JCRG":"CCONJ", - # SCONJ - "SCONJ":"SCONJ", - "PREL":"SCONJ", - "JSBR":"SCONJ", - "JCMP":"SCONJ", + "CCONJ": "CCONJ", + "JCRG": "CCONJ", + # SCONJ + "SCONJ": "SCONJ", + "PREL": "SCONJ", + "JSBR": "SCONJ", + "JCMP": "SCONJ", # PART - "PART":"PART", - "FIXN":"PART", - "FIXV":"PART", - "EAFF":"PART", - "EITT":"PART", - "AITT":"PART", - "NEG":"PART", + "PART": "PART", + "FIXN": "PART", + "FIXV": "PART", + "EAFF": "PART", + "EITT": "PART", + "AITT": "PART", + "NEG": "PART", 
# PUNCT - "PUNCT":"PUNCT", - "PUNC":"PUNCT" + "PUNCT": "PUNCT", + "PUNC": "PUNCT", } -def _UD_Exception(w,tag): - if w=="āļāļēāļĢ" or w=="āļ„āļ§āļēāļĄ": - return "NOUN" - return tag -def _orchid_to_ud(tag): - _i=0 - temp=[] - while _i str: + if w == "āļāļēāļĢ" or w == "āļ„āļ§āļēāļĄ": + return "NOUN" + + return tag + + +def _orchid_to_ud(tag) -> List[Tuple[str, str]]: + _i = 0 + temp = [] + while _i < len(tag): + temp.append((tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]]))) + _i += 1 + + return temp + + +def _artagger_tag(words: List[str], corpus: str = None) -> List[Tuple[str, str]]: + if not words: + return [] + + from artagger import Tagger -def pos_tag(words, engine="perceptron", corpus="orchid"): + words_ = Tagger().tag(" ".join(words)) + + return [(word.word, word.tag) for word in words_] + + +def pos_tag( + words: List[str], engine: str = "perceptron", corpus: str = "orchid" +) -> List[Tuple[str, str]]: """ Part of Speech tagging function. @@ -121,41 +137,36 @@ def pos_tag(words, engine="perceptron", corpus="orchid"): * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ - _corpus=corpus - _tag=[] - if corpus=="orchid_ud": - corpus="orchid" + _corpus = corpus + _tag = [] + if corpus == "orchid_ud": + corpus = "orchid" if not words: return [] if engine == "perceptron": from .perceptron import tag as tag_ elif engine == "artagger": - - def tag_(words, corpus=None): - if not words: - return [] - - from artagger import Tagger - words_ = Tagger().tag(" ".join(words)) - - return [(word.word, word.tag) for word in words_] - + tag_ = _artagger_tag else: # default, use "unigram" ("old") engine from .unigram import tag as tag_ - _tag= tag_(words, corpus=corpus) - if _corpus=="orchid_ud": - _tag=_orchid_to_ud(_tag) + _tag = tag_(words, corpus=corpus) + + if _corpus == "orchid_ud": + _tag = _orchid_to_ud(_tag) + return _tag -def pos_tag_sents(sentences, engine="perceptron", corpus="orchid"): +def pos_tag_sents( + sentences: List[List[str]], engine: str = "perceptron", corpus: str = "orchid" +) -> List[List[Tuple[str, str]]]: """ Part of Speech tagging Sentence function. 
- :param list sentences: a list of tokenized sentences (a list of tokenized words in sentences) + :param list sentences: a list of lists of tokenized words :param str engine: - * unigram - unigram tagger + * unigram - unigram tagger * perceptron - perceptron tagger (default) * artagger - RDR POS tagger :param str corpus: diff --git a/pythainlp/tag/locations.py b/pythainlp/tag/locations.py index 01bf3060c..74fb96e5d 100644 --- a/pythainlp/tag/locations.py +++ b/pythainlp/tag/locations.py @@ -3,10 +3,12 @@ Recognizes locations in text """ +from typing import List, Tuple + from pythainlp.corpus import provinces -def tag_provinces(tokens): +def tag_provinces(tokens: List[str]) -> List[Tuple[str, str]]: """ Recognize Thailand provinces in text diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index a1236d171..dca5d18b8 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -5,20 +5,22 @@ __all__ = ["ThaiNameTagger"] +from typing import List, Tuple, Union + import sklearn_crfsuite from pythainlp.corpus import download, get_corpus_path, thai_stopwords from pythainlp.tag import pos_tag from pythainlp.tokenize import word_tokenize -from pythainlp.util import is_thaiword +from pythainlp.util import isthai _WORD_TOKENIZER = "newmm" # āļ•āļąāļ§āļ•āļąāļ”āļ„āļģ -def _is_stopword(word): # āđ€āļŠāđ‡āļ„āļ§āđˆāļēāđ€āļ›āđ‡āļ™āļ„āļģāļŸāļļāđˆāļĄāđ€āļŸāļ·āļ­āļĒ +def _is_stopword(word: str) -> bool: # āđ€āļŠāđ‡āļ„āļ§āđˆāļēāđ€āļ›āđ‡āļ™āļ„āļģāļŸāļļāđˆāļĄāđ€āļŸāļ·āļ­āļĒ return word in thai_stopwords() -def _doc2features(doc, i): +def _doc2features(doc, i) -> dict: word = doc[i][0] postag = doc[i][1] @@ -26,7 +28,7 @@ def _doc2features(doc, i): features = { "word.word": word, "word.stopword": _is_stopword(word), - "word.isthai": is_thaiword(word), + "word.isthai": isthai(word), "word.isspace": word.isspace(), "postag": postag, "word.isdigit": word.isdigit(), @@ -41,7 +43,7 @@ def _doc2features(doc, i): prev_features = { "word.prevword": prevword, "word.previsspace": prevword.isspace(), - "word.previsthai": is_thaiword(prevword), + "word.previsthai": isthai(prevword), "word.prevstopword": _is_stopword(prevword), "word.prevpostag": prevpostag, "word.prevwordisdigit": prevword.isdigit(), @@ -58,7 +60,7 @@ def _doc2features(doc, i): "word.nextword": nextword, "word.nextisspace": nextword.isspace(), "word.nextpostag": nextpostag, - "word.nextisthai": is_thaiword(nextword), + "word.nextisthai": isthai(nextword), "word.nextstopword": _is_stopword(nextword), "word.nextwordisdigit": nextword.isdigit(), } @@ -87,7 +89,9 @@ def __init__(self): model_filename=self.__data_path, ) - def get_ner(self, text, pos=True): + def get_ner( + self, text: str, pos: bool = True + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: """ Get named-entities in text @@ -101,10 +105,11 @@ def get_ner(self, text, pos=True): >>> ner = ThaiNameTagger() >>> ner.get_ner("āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 
61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.") [('āļ§āļąāļ™āļ—āļĩāđˆ', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'), - (' ', 'PUNCT', 'I-DATE'), ('āļ.āļĒ.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'), - ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'), ('āļ—āļ”āļŠāļ­āļš', 'VERB', 'O'), - ('āļĢāļ°āļšāļš', 'NOUN', 'O'), ('āđ€āļ§āļĨāļē', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), - ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'), + (' ', 'PUNCT', 'I-DATE'), ('āļ.āļĒ.', 'NOUN', 'I-DATE'), + (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'), + (' ', 'PUNCT', 'O'), ('āļ—āļ”āļŠāļ­āļš', 'VERB', 'O'), + ('āļĢāļ°āļšāļš', 'NOUN', 'O'), ('āđ€āļ§āļĨāļē', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), + ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'), ('āļ™.', 'NOUN', 'I-TIME')] >>> ner.get_ner("āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.", pos=False) [('āļ§āļąāļ™āļ—āļĩāđˆ', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'), @@ -113,7 +118,9 @@ def get_ner(self, text, pos=True): (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('āļ™.', 'I-TIME')] """ self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER) - self.__pos_tags = pos_tag(self.__tokens,engine="perceptron", corpus="orchid_ud") + self.__pos_tags = pos_tag( + self.__tokens, engine="perceptron", corpus="orchid_ud" + ) self.__x_test = self.__extract_features(self.__pos_tags) self.__y = self.crf.predict_single(self.__x_test) diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 4032df759..ccff12427 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -3,6 +3,7 @@ Perceptron Part-Of-Speech tagger """ import os +from typing import List, Tuple import dill from pythainlp.corpus import corpus_path @@ -22,127 +23,124 @@ def _load_tagger(filename): _PUD_TAGGER = _load_tagger(_PUD_DATA_FILENAME) -def tag(words, corpus="pud"): +def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]: """ āļĢāļąāļšāļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āļ„āļ·āļ™āļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āđ€āļŠāđˆāļ™ [('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ...] """ if not words: return [] - # perceptron tagger cannot handle empty string - #words = [word.strip() for word in words if word.strip()] - if corpus == "orchid": tagger = _ORCHID_TAGGER - i=0 - while i': - words[i]="" - elif words[i]=='=': - words[i]="" - elif words[i]=='!': - words[i]="" - elif words[i]=='’': - words[i]="" - elif words[i]==':': - words[i]="" - elif words[i]=='*': - words[i]="" - elif words[i]==';': - words[i]="" - elif words[i]=='/': - words[i]="" - i+=1 - t2=tagger.tag(words) - t=[] - i=0 - while i" or word=='': - word=" " - elif word=="": - word="+" - elif word=="": - word="-" - elif word=="": - word="=" - elif word=="": - word="," - elif word=="": - word="$" - elif word=="": - word="." - elif word=="": - word="(" - elif word=="": - word=")" - elif word=="": - word='"' - elif word=="": - word='@' - elif word=="": - word='&' - elif word=="": - word='{' - elif word=="": - word='^' - elif word=="": - word='?' - elif word=="": - word='<' - elif word=="": - word='>' - elif word=="": - word='=' - elif word=="": - word='!' 
- elif word=="": - word='’' - elif word=="": - word=':' - elif word=="": - word='*' - elif word=="": - word=';' - elif word=="": - word='/' - t.append((word,tag)) - i+=1 - #t=temp + i = 0 + while i < len(words): + if words[i] == " ": + words[i] = "" + elif words[i] == "+": + words[i] = "" + elif words[i] == "-": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == ",": + words[i] = "" + elif words[i] == "$": + words[i] = "" + elif words[i] == ".": + words[i] = "" + elif words[i] == "(": + words[i] = "" + elif words[i] == ")": + words[i] = "" + elif words[i] == '"': + words[i] = "" + elif words[i] == "@": + words[i] = "" + elif words[i] == "&": + words[i] = "" + elif words[i] == "{": + words[i] = "" + elif words[i] == "^": + words[i] = "" + elif words[i] == "?": + words[i] = "" + elif words[i] == "<": + words[i] = "" + elif words[i] == ">": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == "!": + words[i] = "" + elif words[i] == "’": + words[i] = "" + elif words[i] == ":": + words[i] = "" + elif words[i] == "*": + words[i] = "" + elif words[i] == ";": + words[i] = "" + elif words[i] == "/": + words[i] = "" + i += 1 + t2 = tagger.tag(words) + t = [] + i = 0 + while i < len(t2): + word = t2[i][0] + tag = t2[i][1] + if word == "": + word = " " + elif word == "": + word = "+" + elif word == "": + word = "-" + elif word == "": + word = "=" + elif word == "": + word = "," + elif word == "": + word = "$" + elif word == "": + word = "." + elif word == "": + word = "(" + elif word == "": + word = ")" + elif word == "": + word = '"' + elif word == "": + word = "@" + elif word == "": + word = "&" + elif word == "": + word = "{" + elif word == "": + word = "^" + elif word == "": + word = "?" + elif word == "": + word = "<" + elif word == "": + word = ">" + elif word == "": + word = "=" + elif word == "": + word = "!" + elif word == "": + word = "’" + elif word == "": + word = ":" + elif word == "": + word = "*" + elif word == "": + word = ";" + elif word == "": + word = "/" + t.append((word, tag)) + i += 1 else: # default, use "pud" as a corpus tagger = _PUD_TAGGER - t=tagger.tag(words) + t = tagger.tag(words) + return t diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py index 863323a1f..ece6e3028 100644 --- a/pythainlp/tag/unigram.py +++ b/pythainlp/tag/unigram.py @@ -4,6 +4,7 @@ """ import json import os +from typing import List, Tuple import dill import nltk.tag @@ -27,7 +28,7 @@ def _pud_tagger(): return model -def tag(words, corpus): +def tag(words: List[str], corpus: str) -> List[Tuple[str, str]]: """ āļĢāļąāļšāļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āļ„āļ·āļ™āļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āđ€āļŠāđˆāļ™ [('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ...] """ @@ -36,116 +37,116 @@ def tag(words, corpus): if corpus == "orchid": tagger = nltk.tag.UnigramTagger(model=_orchid_tagger()) - i=0 - while i': - words[i]="" - elif words[i]=='=': - words[i]="" - elif words[i]=='!': - words[i]="" - elif words[i]=='’': - words[i]="" - elif words[i]==':': - words[i]="" - elif words[i]=='*': - words[i]="" - elif words[i]==';': - words[i]="" - elif words[i]=='/': - words[i]="" - i+=1 - t=tagger.tag(words) - temp=[] - i=0 - while i": - word=" " - elif word=="": - word="+" - elif word=="": - word="-" - elif word=="": - word="=" - elif word=="": - word="," - elif word=="": - word="$" - elif word=="": - word="." 
- elif word=="": - word="(" - elif word=="": - word=")" - elif word=="": - word='"' - elif word=="": - word='@' - elif word=="": - word='&' - elif word=="": - word='{' - elif word=="": - word='^' - elif word=="": - word='?' - elif word=="": - word='<' - elif word=="": - word='>' - elif word=="": - word='=' - elif word=="": - word='!' - elif word=="": - word='’' - elif word=="": - word=':' - elif word=="": - word='*' - elif word=="": - word=';' - elif word=="": - word='/' - temp.append((word,tag)) - i+=1 - t=temp + i = 0 + while i < len(words): + if words[i] == " ": + words[i] = "" + elif words[i] == "+": + words[i] = "" + elif words[i] == "-": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == ",": + words[i] = "" + elif words[i] == "$": + words[i] = "" + elif words[i] == ".": + words[i] = "" + elif words[i] == "(": + words[i] = "" + elif words[i] == ")": + words[i] = "" + elif words[i] == '"': + words[i] = "" + elif words[i] == "@": + words[i] = "" + elif words[i] == "&": + words[i] = "" + elif words[i] == "{": + words[i] = "" + elif words[i] == "^": + words[i] = "" + elif words[i] == "?": + words[i] = "" + elif words[i] == "<": + words[i] = "" + elif words[i] == ">": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == "!": + words[i] = "" + elif words[i] == "’": + words[i] = "" + elif words[i] == ":": + words[i] = "" + elif words[i] == "*": + words[i] = "" + elif words[i] == ";": + words[i] = "" + elif words[i] == "/": + words[i] = "" + i += 1 + t = tagger.tag(words) + temp = [] + i = 0 + while i < len(t): + word = t[i][0] + tag = t[i][1] + if word == "": + word = " " + elif word == "": + word = "+" + elif word == "": + word = "-" + elif word == "": + word = "=" + elif word == "": + word = "," + elif word == "": + word = "$" + elif word == "": + word = "." + elif word == "": + word = "(" + elif word == "": + word = ")" + elif word == "": + word = '"' + elif word == "": + word = "@" + elif word == "": + word = "&" + elif word == "": + word = "{" + elif word == "": + word = "^" + elif word == "": + word = "?" + elif word == "": + word = "<" + elif word == "": + word = ">" + elif word == "": + word = "=" + elif word == "": + word = "!" 
+ elif word == "": + word = "’" + elif word == "": + word = ":" + elif word == "": + word = "*" + elif word == "": + word = ";" + elif word == "": + word = "/" + temp.append((word, tag)) + i += 1 + t = temp else: tagger = _pud_tagger() - t=tagger.tag(words) + t = tagger.tag(words) return t diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index d3c9bb1d5..296460857 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -3,6 +3,8 @@ Thai tokenizers """ import re +from typing import Iterable, List, Union + from pythainlp.corpus import get_corpus, thai_syllables, thai_words from marisa_trie import Trie @@ -11,11 +13,13 @@ FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt")) -def word_tokenize(text, engine="newmm", whitespaces=True): +def word_tokenize( + text: str, engine: str = "newmm", whitespaces: bool = True +) -> List[str]: """ :param str text: text to be tokenized :param str engine: tokenizer to be used - :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai + :param bool whitespaces: True to output no whitespace, a common mark of end of phrase in Thai :Parameters for engine: * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster * longest - dictionary-based, Longest Matching @@ -60,7 +64,9 @@ def segment(text): return segment(text) -def dict_word_tokenize(text, custom_dict, engine="newmm"): +def dict_word_tokenize( + text: str, custom_dict: Trie, engine: str = "newmm" +) -> List[str]: """ :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. :param str text: text to be tokenized @@ -90,7 +96,7 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"): return segment(text, custom_dict) -def sent_tokenize(text, engine="whitespace+newline"): +def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: """ This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found. @@ -106,14 +112,14 @@ def sent_tokenize(text, engine="whitespace+newline"): sentences = [] if engine == "whitespace": - sentences = re.split(r' +', text, re.U) + sentences = re.split(r" +", text, re.U) else: # default, use whitespace + newline sentences = text.split() return sentences -def subword_tokenize(text, engine="tcc"): +def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: """ :param str text: text to be tokenized :param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units. @@ -127,7 +133,7 @@ def subword_tokenize(text, engine="tcc"): return tcc(text) -def syllable_tokenize(text): +def syllable_tokenize(text: str) -> List[str]: """ :param str text: input string to be tokenized @@ -147,7 +153,7 @@ def syllable_tokenize(text): return tokens -def dict_trie(dict_source): +def dict_trie(dict_source: Union[str, Iterable]) -> Trie: """ Create a dict trie which will be used for word_tokenize() function. 
For more information on the trie data structure, @@ -162,17 +168,19 @@ def dict_trie(dict_source): with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() return Trie(_vocabs) - elif isinstance(dict_source, (list, tuple, set, frozenset)): + elif isinstance(dict_source, Iterable): # Received a sequence type object of vocabs return Trie(dict_source) else: raise TypeError( - "Type of dict_source must be either str (path to source file) or collections" + "Type of dict_source must be either str (path to source file) or iterable" ) class Tokenizer: - def __init__(self, custom_dict=None,tokenize_engine="newmm"): + def __init__( + self, custom_dict: Union[str, Iterable] = None, tokenize_engine: str = "newmm" + ): """ Initialize tokenizer object @@ -180,20 +188,24 @@ def __init__(self, custom_dict=None,tokenize_engine="newmm"): :param str tokenize_engine: choose between different options of engine to token (newmm, mm, longest) """ self.__trie_dict = None - self.word_engine=tokenize_engine + self.word_engine = tokenize_engine if custom_dict: self.__trie_dict = dict_trie(custom_dict) else: self.__trie_dict = dict_trie(thai_words()) - def word_tokenize(self, text): + + def word_tokenize(self, text: str) -> List[str]: """ :param str text: text to be tokenized :return: list of words, tokenized from the text """ - return dict_word_tokenize(text,custom_dict=self.__trie_dict,engine=self.word_engine) - def set_tokenize_engine(self,name_engine): + return dict_word_tokenize( + text, custom_dict=self.__trie_dict, engine=self.word_engine + ) + + def set_tokenize_engine(self, name_engine: str) -> None: """ :param str name_engine: choose between different options of engine to token (newmm, mm, longest) """ - self.word_engine=name_engine \ No newline at end of file + self.word_engine = name_engine diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 395e76583..a3844c2f3 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -3,8 +3,10 @@ Wrapper for deepcut Thai word segmentation """ +from typing import List + import deepcut -def segment(text): +def segment(text: str) -> List[str]: return deepcut.tokenize(text) diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index dbe04122a..986878001 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -20,7 +20,7 @@ _UV2 = "[" + "".join(["āļą", "āļ·"]) + "]" -def etcc(text): +def etcc(text: str) -> str: """ Enhanced Thai Character Cluster (ETCC) diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 33ff1fa0a..83ce495a1 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -39,14 +39,13 @@ class LongestMatchTokenizer(object): def __init__(self, trie): self.__trie = trie - def __search_nonthai(self, text): + def __search_nonthai(self, text: str): match = _RE_NONTHAI.search(text) if match.group(0): return match.group(0).lower() - else: - return None + return None - def __is_next_word_valid(self, text, begin_pos): + def __is_next_word_valid(self, text: str, begin_pos: int) -> bool: len_text = len(text) text = text[begin_pos:len_text].strip() @@ -63,7 +62,7 @@ def __is_next_word_valid(self, text, begin_pos): return False - def __longest_matching(self, text, begin_pos): + def __longest_matching(self, text: str, begin_pos: int): len_text = len(text) text = text[begin_pos:len_text] @@ -94,7 +93,7 @@ def __longest_matching(self, text, begin_pos): else: return "" - def __segment_text(self, text): + def 
__segment_text(self, text: str): if not text: return [] diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 17815fd9f..066ff1017 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -9,6 +9,7 @@ import re from collections import defaultdict from heapq import heappop, heappush # for priority queue +from typing import List from pythainlp.tokenize import DEFAULT_DICT_TRIE @@ -38,7 +39,7 @@ def bfs_paths_graph(graph, start, goal): queue.append((next, path + [next])) -def onecut(text, trie): +def onecut(text: str, trie): graph = defaultdict(list) # main data structure allow_pos = tcc_pos(text) # āļ•āļģāđāļŦāļ™āđˆāļ‡āļ—āļĩāđˆāļ•āļąāļ” āļ•āđ‰āļ­āļ‡āļ•āļĢāļ‡āļāļąāļš tcc @@ -90,7 +91,7 @@ def onecut(text, trie): # āļŠāđˆāļ§āļĒāđƒāļŦāđ‰āđ„āļĄāđˆāļ•āđ‰āļ­āļ‡āļžāļīāļĄāļžāđŒāļĒāļēāļ§āđ† -def segment(text, trie=None): +def segment(text: str, trie=None) -> List[str]: if not text: return [] diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index 23b7b38e4..33fc0aabc 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -3,11 +3,12 @@ Wrapper for ICU word segmentation """ import re +from typing import List from icu import BreakIterator, Locale -def _gen_words(text): +def _gen_words(text: str) -> str: bd = BreakIterator.createWordInstance(Locale("th")) bd.setText(text) p = bd.first() @@ -16,7 +17,7 @@ def _gen_words(text): p = q -def segment(text): +def segment(text: str) -> List[str]: if not text: return [] diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index b50bdb24a..8ef125217 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -10,6 +10,7 @@ - Python code: Korakot Chaovavanich """ import re +from typing import List, Set RE_TCC = ( """\ @@ -47,9 +48,9 @@ PAT_TCC = re.compile("|".join(RE_TCC)) -def tcc_gen(w): +def tcc_gen(w: str) -> str: if not w: - return '' + return "" p = 0 while p < len(w): @@ -62,7 +63,7 @@ def tcc_gen(w): p += n -def tcc_pos(text): +def tcc_pos(text: str) -> Set[int]: if not text: return set() @@ -75,8 +76,5 @@ def tcc_pos(text): return p_set -def tcc(text, sep="/"): - if not text: - return "" - - return sep.join(tcc_gen(text)) +def tcc(text: str) -> List[str]: + return list(tcc_gen(text)) diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index e2487e582..5f7a5a5cb 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -5,19 +5,20 @@ For text processing and text conversion, see pythainlp.util """ import os -import sys + import pythainlp PYTHAINLP_DATA_DIR = "pythainlp-data" -def get_full_data_path(path): + +def get_full_data_path(path: str) -> str: """ Get filename/path of a dataset, return full path of that filename/path """ return os.path.join(get_pythainlp_data_path(), path) -def get_pythainlp_data_path(): +def get_pythainlp_data_path() -> str: """ Return full path where PyThaiNLP keeps its (downloaded) data """ @@ -27,7 +28,7 @@ def get_pythainlp_data_path(): return path -def get_pythainlp_path(): +def get_pythainlp_path() -> str: """ Return full path of PyThaiNLP code """ diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py index df96b0360..91435cc54 100644 --- a/pythainlp/transliterate/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -3,15 +3,15 @@ from pythainlp.tokenize import word_tokenize -# āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ -def romanize(text, engine="royin"): +def romanize(text: str, engine: str 
= "royin") -> str: """ + āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ :param str text: Thai text to be romanized :param str engine: 'royin' (default) or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'thai2rom' is deep learning Thai romanization (require keras). :return: English (more or less) text that spells out how the Thai text should read. """ - if isinstance(text,str)==False: + if not isinstance(text, str) or not text: return "" if engine == "thai2rom": @@ -21,22 +21,20 @@ def romanize(text, engine="royin"): else: # use default engine "royin" from .royin import romanize - try: - words = word_tokenize(text) - romanized_words = [romanize(word) for word in words] - except: - romanized_words =[romanize(text)] + words = word_tokenize(text) + romanized_words = [romanize(word) for word in words] + return "".join(romanized_words) -def transliterate(text, engine="ipa"): +def transliterate(text: str, engine: str = "ipa") -> str: """ :param str text: Thai text to be transliterated :param str engine: 'ipa' (default) or 'pyicu'. :return: A string of Internaitonal Phonetic Alphabets indicating how the text should read. """ - if not text: + if not isinstance(text, str) or not text: return "" if engine == "pyicu": diff --git a/pythainlp/transliterate/ipa.py b/pythainlp/transliterate/ipa.py index 5fe18d24d..be7c1e1c6 100644 --- a/pythainlp/transliterate/ipa.py +++ b/pythainlp/transliterate/ipa.py @@ -7,7 +7,7 @@ _EPI_THA = epitran.Epitran("tha-Thai") -def transliterate(text): +def transliterate(text: str) -> str: return _EPI_THA.transliterate(text) diff --git a/pythainlp/transliterate/pyicu.py b/pythainlp/transliterate/pyicu.py index e34be0e16..5e4a755aa 100644 --- a/pythainlp/transliterate/pyicu.py +++ b/pythainlp/transliterate/pyicu.py @@ -6,7 +6,7 @@ # āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ -def transliterate(text): +def transliterate(text: str) -> str: """ āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ āļĢāļąāļšāļ„āđˆāļē ''str'' āļ‚āđ‰āļ­āļ„āļ§āļēāļĄ āļ„āļ·āļ™āļ„āđˆāļē ''str'' āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ """ diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py index 62e44783b..d6f6f71c8 100644 --- a/pythainlp/transliterate/royin.py +++ b/pythainlp/transliterate/royin.py @@ -117,20 +117,20 @@ ) -def _normalize(text): +def _normalize(text: str) -> str: """āļ•āļąāļ”āļ­āļąāļāļĐāļĢāļ—āļĩāđˆāđ„āļĄāđˆāļ­āļ­āļāđ€āļŠāļĩāļĒāļ‡ (āļāļēāļĢāļąāļ™āļ•āđŒ āđ„āļ›āļĒāļēāļĨāļ™āđ‰āļ­āļĒ āđ„āļĄāđ‰āļĒāļĄāļ*) āđāļĨāļ°āļ§āļĢāļĢāļ“āļĒāļļāļāļ•āđŒāļ—āļīāđ‰āļ‡""" return _RE_NORMALIZE.sub("", text) -def _replace_vowels(word): +def _replace_vowels(word: str) -> str: for vowel in _VOWELS: word = re.sub(vowel[0], vowel[1], word) return word -def _replace_consonants(word, res): - if res is None: +def _replace_consonants(word: str, res: str) -> str: + if not res: pass elif len(res) == 1: word = word.replace(res[0], _CONSONANTS[res[0]][0]) @@ -162,9 +162,10 @@ def _replace_consonants(word, res): return word -def romanize(word): - if isinstance(word,str)==False: +def romanize(word: str) -> str: + if not isinstance(word, str) or not word: return "" + word2 = _replace_vowels(_normalize(word)) res = _RE_CONSONANT.findall(word2) @@ -175,5 +176,5 @@ def romanize(word): word2 = "".join(word2) word2 = _replace_consonants(word2, res) - + return word2 \ No newline at end of file diff --git a/pythainlp/transliterate/thai2rom.py b/pythainlp/transliterate/thai2rom.py 
index 49a498d83..1dc5a5267 100644 --- a/pythainlp/transliterate/thai2rom.py +++ b/pythainlp/transliterate/thai2rom.py @@ -157,5 +157,5 @@ def romanize(self, text): _THAI_TO_ROM = ThaiTransliterator() -def romanize(text): +def romanize(text: str) -> str: return _THAI_TO_ROM.romanize(text) diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py index ab56c81ce..00c9f8891 100644 --- a/pythainlp/ulmfit/__init__.py +++ b/pythainlp/ulmfit/__init__.py @@ -35,8 +35,9 @@ _MODEL_NAME_LSTM = "wiki_lm_lstm" _ITOS_NAME_LSTM = "wiki_itos_lstm" + # Download pretrained models -def _get_path(fname): +def _get_path(fname: str) -> str: """ :meth: download get path of file from pythainlp-corpus :param str fname: file name @@ -56,7 +57,7 @@ class ThaiTokenizer(BaseTokenizer): https://docs.fast.ai/text.transform#BaseTokenizer """ - def __init__(self, lang = "th"): + def __init__(self, lang="th"): self.lang = lang def tokenizer(self, t): @@ -94,6 +95,7 @@ def rm_brackets(t): new_line = re.sub(r"\[\]", "", new_line) return new_line + def ungroup_emoji(toks): "Ungroup emojis" res = [] @@ -105,6 +107,7 @@ def ungroup_emoji(toks): res.append(tok) return res + def lowercase_all(toks): "lowercase all English words" return [tok.lower() for tok in toks] @@ -112,17 +115,26 @@ def lowercase_all(toks): # Pretrained paths # TODO: Let the user decide if they like to download (at setup?) -_THWIKI_LSTM = dict(wgts_fname=_get_path(_MODEL_NAME_LSTM), itos_fname=_get_path(_ITOS_NAME_LSTM)) +_THWIKI_LSTM = dict( + wgts_fname=_get_path(_MODEL_NAME_LSTM), itos_fname=_get_path(_ITOS_NAME_LSTM) +) # Preprocessing rules for Thai text -pre_rules_th = [fix_html, replace_rep_after, normalize_char_order, - spec_add_spaces, rm_useless_spaces, rm_useless_newlines, rm_brackets] +pre_rules_th = [ + fix_html, + replace_rep_after, + normalize_char_order, + spec_add_spaces, + rm_useless_spaces, + rm_useless_newlines, + rm_brackets, +] post_rules_th = [replace_all_caps, ungroup_emoji, lowercase_all] _tokenizer = ThaiTokenizer() -def document_vector(text, learn, data, agg='mean'): +def document_vector(text, learn, data, agg="mean"): """ :meth: `document_vector` get document vector using fastai language model and data bunch :param str text: text to extract embeddings @@ -131,18 +143,18 @@ def document_vector(text, learn, data, agg='mean'): :param agg: how to aggregate embeddings :return: `numpy.array` of document vector sized 400 based on the encoder of the model """ - + s = _tokenizer.tokenizer(text) t = torch.tensor(data.vocab.numericalize(s), requires_grad=False).to(device) m = learn.model[0].encoder.to(device) res = m(t).cpu().detach().numpy() - if agg == 'mean': + if agg == "mean": res = res.mean(0) - elif agg == 'sum': + elif agg == "sum": res = res.sum(0) else: - raise ValueError('Aggregate by mean or sum') - return(res) + raise ValueError("Aggregate by mean or sum") + return res def merge_wgts(em_sz, wgts, itos_pre, itos_new): diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index b7e194436..6a4ff0ce6 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -11,9 +11,9 @@ "digit_to_text", "eng_to_thai", "find_keyword", - "is_thai", - "is_thaichar", - "is_thaiword", + "countthai", + "isthai", + "isthaichar", "normalize", "now_reign_year", "num_to_thaiword", @@ -42,6 +42,6 @@ from .keywords import find_keyword, rank from .normalize import deletetone, normalize from .numtoword import bahttext, num_to_thaiword -from .thai import is_thai, is_thaichar, is_thaiword +from .thai import 
countthai, isthai, isthaichar +from .thaiwordcheck import thaicheck from .wordtonum import thaiword_to_num -from .thaiwordcheck import thaicheck \ No newline at end of file diff --git a/pythainlp/util/collate.py b/pythainlp/util/collate.py index bc35c2fe9..ffaff4998 100644 --- a/pythainlp/util/collate.py +++ b/pythainlp/util/collate.py @@ -4,25 +4,27 @@ Simple implementation using regular expressions """ import re +from typing import Iterable, List _RE_TONE = re.compile(r"[āđ‡-āđŒ]") _RE_LV_C = re.compile(r"([āđ€-āđ„])([āļ-āļŪ])") -def _thkey(word): +def _thkey(word: str) -> str: cv = _RE_TONE.sub("", word) # remove tone cv = _RE_LV_C.sub("\\2\\1", cv) # switch lead vowel tone = _RE_TONE.sub(" ", word) # just tone return cv + tone -def collate(data): +def collate(data: Iterable, reverse: bool = False) -> List[str]: """ - :param list data: a list of strings + :param list data: a list of strings to be sorted + :param bool reverse: reverse flag, set to get the result in descending order :return: a list of strings, sorted alphabetically, according to Thai rules **Example**:: >>> from pythainlp.util import * >>> collate(['āđ„āļāđˆ', 'āđ€āļ›āđ‡āļ”', 'āļŦāļĄāļđ', 'āļ§āļąāļ§']) ['āđ„āļāđˆ', 'āđ€āļ›āđ‡āļ”', 'āļ§āļąāļ§', 'āļŦāļĄāļđ'] """ - return sorted(data, key=_thkey) + return sorted(data, key=_thkey, reverse=reverse) diff --git a/pythainlp/util/date.py b/pythainlp/util/date.py index f2d2ee15b..903e42fd4 100644 --- a/pythainlp/util/date.py +++ b/pythainlp/util/date.py @@ -63,7 +63,7 @@ # Conversion support for thai_strftime() -def _thai_strftime(datetime, fmt_c): +def _thai_strftime(datetime, fmt_c: str) -> str: text = "" if fmt_c == "a": # abbreviated weekday text = thai_abbr_weekdays[datetime.weekday()] @@ -73,7 +73,7 @@ def _thai_strftime(datetime, fmt_c): text = thai_abbr_months[datetime.month - 1] elif fmt_c == "B": # full month text = thai_full_months[datetime.month - 1] - elif fmt_c == "y": # # year without century + elif fmt_c == "y": # year without century text = str(datetime.year + 543)[2:4] elif fmt_c == "Y": # year with century text = str(datetime.year + 543) @@ -97,7 +97,7 @@ def _thai_strftime(datetime, fmt_c): return text -def thai_strftime(datetime, fmt, thaidigit=False): +def thai_strftime(datetime, fmt: str, thaidigit=False) -> str: """ Thai date and time string formatter @@ -126,7 +126,7 @@ def thai_strftime(datetime, fmt, thaidigit=False): If supported, we can just locale.setlocale(locale.LC_TIME, "th_TH") and then use native datetime.strftime(). - :return: Date and time spelled out in text, with month in Thai name and year in Thai Buddhist Era (BE). + :return: Date and time spelled out, with day and month names in Thai and year in Thai Buddhist Era (BE). 
""" thaidate_parts = [] diff --git a/pythainlp/util/digitconv.py b/pythainlp/util/digitconv.py index 16e634833..3982168d6 100644 --- a/pythainlp/util/digitconv.py +++ b/pythainlp/util/digitconv.py @@ -56,7 +56,7 @@ } -def thai_digit_to_arabic_digit(text): +def thai_digit_to_arabic_digit(text: str) -> str: """ :param str text: Text with Thai digits such as 'āđ‘', 'āđ’', 'āđ“' :return: Text with Thai digits being converted to Arabic digits such as '1', '2', '3' @@ -74,7 +74,7 @@ def thai_digit_to_arabic_digit(text): return "".join(newtext) -def arabic_digit_to_thai_digit(text): +def arabic_digit_to_thai_digit(text: str) -> str: """ :param str text: Text with Arabic digits such as '1', '2', '3' :return: Text with Arabic digits being converted to Thai digits such as 'āđ‘', 'āđ’', 'āđ“' @@ -92,7 +92,7 @@ def arabic_digit_to_thai_digit(text): return "".join(newtext) -def digit_to_text(text): +def digit_to_text(text: str) -> str: """ :param str text: Text with digits such as '1', '2', 'āđ“', 'āđ”' :return: Text with digits being spelled out in Thai @@ -113,7 +113,7 @@ def digit_to_text(text): return "".join(newtext) -def text_to_arabic_digit(text): +def text_to_arabic_digit(text: str) -> str: """ :param text: A digit spelled out in Thai :return: An Arabic digit such as '1', '2', '3' @@ -124,7 +124,7 @@ def text_to_arabic_digit(text): return _spell_digit[text] -def text_to_thai_digit(text): +def text_to_thai_digit(text: str) -> str: """ :param text: A digit spelled out in Thai :return: A Thai digit such as 'āđ‘', 'āđ’', 'āđ“' diff --git a/pythainlp/util/keyboard.py b/pythainlp/util/keyboard.py index 8fb4abc6e..ad156715d 100644 --- a/pythainlp/util/keyboard.py +++ b/pythainlp/util/keyboard.py @@ -101,7 +101,7 @@ TH_EN_KEYB_PAIRS = {v: k for k, v in EN_TH_KEYB_PAIRS.items()} -def eng_to_thai(text): +def eng_to_thai(text: str) -> str: """ Correct text in one language that is incorrectly-typed with a keyboard layout in another language. (type Thai with English keyboard) @@ -113,7 +113,7 @@ def eng_to_thai(text): ) -def thai_to_eng(text): +def thai_to_eng(text: str) -> str: """ Correct text in one language that is incorrectly-typed with a keyboard layout in another language. (type Thai with English keyboard) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 66c179fb9..3e05a2c69 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -44,7 +44,7 @@ ] # āđ€āļāđ‡āļšāļžāļ§āļ āļžāļīāļĄāļžāđŒāļĨāļģāļ”āļąāļšāļœāļīāļ”āļŦāļĢāļ·āļ­āļœāļīāļ”āđāļ›āđ‰āļ™āđāļ•āđˆāļāļĨāļąāļšāđāļŠāļ”āļ‡āļœāļĨāļ–āļđāļāļ•āđ‰āļ­āļ‡ āđƒāļŦāđ‰āđ„āļ›āđ€āļ›āđ‡āļ™āđāļ›āđ‰āļ™āļ—āļĩāđˆāļ–āļđāļāļ•āđ‰āļ­āļ‡ āđ€āļŠāđˆāļ™ āđ€ + āđ€ āđ„āļ›āđ€āļ›āđ‡āļ™ āđ -def normalize(text): +def normalize(text: str) -> str: """ Thai text normalize @@ -61,7 +61,7 @@ def normalize(text): return text -def deletetone(text): +def deletetone(text: str) -> str: """ Remove tonemarks diff --git a/pythainlp/util/numtoword.py b/pythainlp/util/numtoword.py index 394984d70..68519cb79 100644 --- a/pythainlp/util/numtoword.py +++ b/pythainlp/util/numtoword.py @@ -10,7 +10,7 @@ __all__ = ["bahttext", "num_to_thaiword"] -def bahttext(number): +def bahttext(number: float) -> str: """ Converts a number to Thai text and adds a suffix of "Baht" currency. Precision will be fixed at two decimal places (0.00) to fits "Satang" unit. 
@@ -41,9 +41,9 @@ def bahttext(number): return ret -def num_to_thaiword(number): +def num_to_thaiword(number: int) -> str: """ - :param float number: a float number (with decimals) indicating a quantity + :param int number: a float number (with decimals) indicating a quantity :return: a text that indicates the full amount in word form, properly ending each digit with the right term. """ ret = "" diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index f6b8f3d58..70e5a9d15 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -2,11 +2,15 @@ """ Check if it is Thai text """ +import string +_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation -def is_thaichar(ch): # āđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ + +def isthaichar(ch: str) -> bool: """ - Check if character is Thai + Check if a character is Thai + āđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ :param str ch: input character :return: True or False @@ -17,45 +21,44 @@ def is_thaichar(ch): # āđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄ return False -def is_thaiword(word): # āđ€āļ›āđ‡āļ™āļ„āļģāļ—āļĩāđˆāļĄāļĩāđāļ•āđˆāļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ +def isthai(word: str, ignore_chars: str = ".") -> bool: """ Check if all character is Thai + āđ€āļ›āđ‡āļ™āļ„āļģāļ—āļĩāđˆāļĄāļĩāđāļ•āđˆāļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ :param str word: input text + :param str ignore_chars: characters to be ignored (i.e. will be considered as Thai) :return: True or False """ + if not ignore_chars: + ignore_chars = "" + for ch in word: - if ch != "." and not is_thaichar(ch): + if ch not in ignore_chars and not isthaichar(ch): return False return True -def is_thai(text, check_all=False): +def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: """ - :param str text: input string or list of strings - :param bool check_all: checks all character or not - - :return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false. 
+ :param str text: input text + :return: float, proportion of characters in the text that is Thai character """ - isthais = [] - num_isthai = 0 + if not text: + return 0 + + if not ignore_chars: + ignore_chars = "" + + num_thai = 0 + num_ignore = 0 for ch in text: - ch_val = ord(ch) - if ch_val >= 3584 and ch_val <= 3711: - num_isthai += 1 - if check_all: - isthais.append(True) - else: - if check_all: - isthais.append(False) - thai_percent = (num_isthai / len(text)) * 100 - - if check_all: - chars = list(text) - isthai_pairs = tuple(zip(chars, isthais)) - data = {"thai": thai_percent, "check_all": isthai_pairs} - else: - data = {"thai": thai_percent} - - return data + if ch in ignore_chars: + num_ignore += 1 + elif isthaichar(ch): + num_thai += 1 + + num_count = len(text) - num_ignore + + return (num_thai / num_count) * 100 diff --git a/pythainlp/util/thaiwordcheck.py b/pythainlp/util/thaiwordcheck.py index d2a036370..7237d2db6 100644 --- a/pythainlp/util/thaiwordcheck.py +++ b/pythainlp/util/thaiwordcheck.py @@ -1,51 +1,76 @@ # -*- coding: utf-8 -*- -''' -From https://github.com/wannaphongcom/open-thai-nlp-document/blob/master/check_thai_word.md -''' +""" +From +https://github.com/wannaphongcom/open-thai-nlp-document/blob/master/check_thai_word.md +""" import re -def _check1(word): # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļŠāļ°āļāļ”āļ§āđˆāļēāļ•āļĢāļ‡āļ•āļēāļĄāļĄāļēāļ•āļĢāļēāđ„āļŦāļĄ - if word in ['āļ','āļ”','āļš','āļ™','āļ‡','āļĄ','āļĒ','āļ§']: - return True - else: - return False -def _check2(word): # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļāļēāļĢāļąāļ™āļ•āđŒ āļ–āđ‰āļēāļĄāļĩ āđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ - if 'āđŒ' in word: - return False - else: - return True -def _check3(word): - if word in list("āļ†āļ“āļŒāļŽāļāļāļ‘āļ’āļ˜āļĻāļĐāļŽ"): # āļ–āđ‰āļēāļĄāļĩ āđāļŠāļ”āļ‡āļ§āđˆāļēāđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ - return False - else: - return True -def thaicheck(word): - """ - Check is Thai Word - - :param str word: word - :return: True or False - """ - pattern = re.compile(r"[āļ-āļŽāļŪ]",re.U) # āļŠāļģāļŦāļĢāļąāļšāļ•āļĢāļ§āļˆāļŠāļ­āļšāļžāļĒāļąāļāļŠāļ™āļ° - res = re.findall(pattern,word) # āļ”āļķāļ‡āļžāļĒāļąāļāļŠāļ™āļ°āļ—āļąāļąāđ‰āļ‡āļŦāļĄāļ”āļ­āļ­āļāļĄāļē - if res==[]: - return False - elif _check1(res[len(res)-1]) or len(res)==1: - if _check2(word): - word2=list(word) - i=0 - thai=True - if word in ['āļ†āđˆāļē','āđ€āļ†āļĩāđˆāļĒāļ™','āļĻāļķāļ','āļĻāļ­āļ','āđ€āļĻāļīāļ','āđ€āļĻāļĢāđ‰āļē','āļ˜','āļ“','āļŊāļžāļ“āļŊ','āđƒāļŦāļāđˆ','āļŦāļāđ‰āļē','āļ„āļ§āļēāļĒ','āļ„āļ§āļēāļĄ','āļāļĢāļīāđˆāļ‡āđ€āļāļĢāļ‡','āļœāļĨāļī']: # āļ‚āđ‰āļ­āļĒāļāđ€āļ§āđ‰āļ™ āļ„āļģāđ€āļŦāļĨāđˆāļēāļ™āļĩāđ‰āđ€āļ›āđ‡āļ™āļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ - return True - while i bool: # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļŠāļ°āļāļ”āļ§āđˆāļēāļ•āļĢāļ‡āļ•āļēāļĄāļĄāļēāļ•āļĢāļēāđ„āļŦāļĄ + if word in ["āļ", "āļ”", "āļš", "āļ™", "āļ‡", "āļĄ", "āļĒ", "āļ§"]: + return True + return False + + +def _check2(word: str) -> bool: # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļāļēāļĢāļąāļ™āļ•āđŒ āļ–āđ‰āļēāļĄāļĩ āđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ + if "āđŒ" in word: + return False + return True + + +def _check3(word: str) -> bool: + if word in list("āļ†āļ“āļŒāļŽāļāļāļ‘āļ’āļ˜āļĻāļĐāļŽ"): # āļ–āđ‰āļēāļĄāļĩ āđāļŠāļ”āļ‡āļ§āđˆāļēāđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ + return False + return True + + +def thaicheck(word: str) -> bool: + """ + Check if a word is an "authentic Thai word" + + :param str word: word + :return: True or False + """ + pattern = re.compile(r"[āļ-āļŽāļŪ]", re.U) # āļŠāļģāļŦāļĢāļąāļšāļ•āļĢāļ§āļˆāļŠāļ­āļšāļžāļĒāļąāļāļŠāļ™āļ° + res = re.findall(pattern, 
word) # āļ”āļķāļ‡āļžāļĒāļąāļāļŠāļ™āļ°āļ—āļąāļąāđ‰āļ‡āļŦāļĄāļ”āļ­āļ­āļāļĄāļē + + if res == []: + return False + + if _check1(res[len(res) - 1]) or len(res) == 1: + if _check2(word): + word2 = list(word) + i = 0 + thai = True + if word in [ + "āļ†āđˆāļē", + "āđ€āļ†āļĩāđˆāļĒāļ™", + "āļĻāļķāļ", + "āļĻāļ­āļ", + "āđ€āļĻāļīāļ", + "āđ€āļĻāļĢāđ‰āļē", + "āļ˜", + "āļ“", + "āļŊāļžāļ“āļŊ", + "āđƒāļŦāļāđˆ", + "āļŦāļāđ‰āļē", + "āļ„āļ§āļēāļĒ", + "āļ„āļ§āļēāļĄ", + "āļāļĢāļīāđˆāļ‡āđ€āļāļĢāļ‡", + "āļœāļĨāļī", + ]: # āļ‚āđ‰āļ­āļĒāļāđ€āļ§āđ‰āļ™ āļ„āļģāđ€āļŦāļĨāđˆāļēāļ™āļĩāđ‰āđ€āļ›āđ‡āļ™āļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ + return True + + while i < len(word2) and thai: + thai = _check3(word2[i]) + if not thai: + return False + i += 1 + return True + + return False + + if word in ["āļāļ°", "āļāļĢāļ°", "āļ›āļ°", "āļ›āļĢāļ°"]: + return True + + return False diff --git a/pythainlp/util/wordtonum.py b/pythainlp/util/wordtonum.py index 7521ec156..43305d329 100644 --- a/pythainlp/util/wordtonum.py +++ b/pythainlp/util/wordtonum.py @@ -6,6 +6,7 @@ https://colab.research.google.com/drive/148WNIeclf0kOU6QxKd6pcfwpSs8l-VKD#scrollTo=EuVDd0nNuI8Q """ import re +from typing import Iterable, List from pythainlp.tokenize import Tokenizer @@ -39,7 +40,7 @@ _TOKENIZER = Tokenizer(custom_dict=_THAIWORD_NUMS_UNITS) -def _thaiword_to_num(tokens): +def _thaiword_to_num(tokens: List[str]) -> int: if not tokens: return None @@ -65,21 +66,21 @@ def _thaiword_to_num(tokens): return _THAI_INT_MAP[a] * _THAI_INT_MAP[b] + _thaiword_to_num(tokens[2:]) -def thaiword_to_num(thaiword): +def thaiword_to_num(word: str) -> int: """ - Converts a thai word to number + Converts a Thai number spellout word to actual number value - :param str thaiword: input thai word + :param str word: a Thai number spellout :return: number """ - if not thaiword: + if not word: return None tokens = [] - if isinstance(thaiword,str): - tokens = _TOKENIZER.word_tokenize(thaiword) - elif isinstance(thaiword,list) or isinstance(thaiword,tuple) or isinstance(thaiword,set) or isinstance(thaiword,frozenset): - for w in thaiword: + if isinstance(word, str): + tokens = _TOKENIZER.word_tokenize(word) + elif isinstance(word, Iterable): + for w in word: tokens.extend(_TOKENIZER.word_tokenize(w)) res = [] diff --git a/pythainlp/word_vector/__init__.py b/pythainlp/word_vector/__init__.py index d035e5395..d1da4a2e3 100644 --- a/pythainlp/word_vector/__init__.py +++ b/pythainlp/word_vector/__init__.py @@ -4,6 +4,8 @@ thai2fit - Thai word vector Code by https://github.com/cstorm125/thai2fit """ +from typing import List + import numpy as np from gensim.models import KeyedVectors from pythainlp.corpus import download as download_data @@ -13,7 +15,7 @@ WV_DIM = 300 -def _download(): +def _download() -> str: path = get_corpus_path("thai2fit_wv") if not path: download_data("thai2fit_wv") @@ -33,7 +35,7 @@ def get_model(): _MODEL = get_model() -def most_similar_cosmul(positive: list, negative: list): +def most_similar_cosmul(positive: List[str], negative: List[str]): """ Word arithmetic operations If a word is not in the vocabulary, KeyError will be raised. @@ -47,18 +49,18 @@ def most_similar_cosmul(positive: list, negative: list): return _MODEL.most_similar_cosmul(positive=positive, negative=negative) -def doesnt_match(listdata): +def doesnt_match(words: List[str]) -> str: """ Pick one word that doesn't match other words in the list If a word is not in the vocabulary, KeyError will be raised. 
- :param list listdata: a list of words + :param list words: a list of words :return: word that doesn't match """ - return _MODEL.doesnt_match(listdata) + return _MODEL.doesnt_match(words) -def similarity(word1, word2): +def similarity(word1: str, word2: str) -> float: """ Get cosine similarity between two words. If a word is not in the vocabulary, KeyError will be raised. @@ -70,7 +72,7 @@ def similarity(word1, word2): return _MODEL.similarity(word1, word2) -def sentence_vectorizer(text, use_mean=True): +def sentence_vectorizer(text: str, use_mean: bool = True): """ Get sentence vector from text If a word is not in the vocabulary, KeyError will be raised. diff --git a/requirements.txt b/requirements.txt index 3159b92b1..7fd66ad78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -marisa-trie -nltk>=3.2.2 dill -marisa_trie +marisa-trie nltk>=3.2.2 pytz requests diff --git a/setup.cfg b/setup.cfg index 350779304..bb022e678 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0 +current_version = 2.0.1 commit = True tag = True diff --git a/setup.py b/setup.py index a47948438..2fe1ac65f 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ "ipa": ["epitran"], "ml": ["fastai>=1.0.38", "keras", "numpy", "torch"], "ner": ["sklearn-crfsuite"], - "thai2fit": ["gensim", "numpy","emoji"], + "thai2fit": ["emoji", "gensim", "numpy"], "thai2rom": ["keras", "numpy"], "full": [ "artagger", @@ -34,7 +34,7 @@ setup( name="pythainlp", - version="2.0", + version="2.0.1", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", @@ -54,16 +54,12 @@ "stopwords_th.txt", "syllables_th.txt", "tha-wn.db", - "new-thaidict.txt", - "negation.txt", - "provinces.csv", - "pt_tagger_1.dill", - "ud_thai-pud_pt_tagger.dill", - "ud_thai-pud_unigram_tagger.dill", - "unigram_tagger.dill", - "words_th.txt", + "thailand_provinces_th.txt", + "tnc_freq.txt", + "ud_thai_pud_pt_tagger.dill", + "ud_thai_pud_unigram_tagger.dill", "words_th_frozen_201810.txt", - "tnc_freq.txt" + "words_th.txt", ], }, include_package_data=True, @@ -77,6 +73,8 @@ "natural language processing", "text analytics", "ThaiNLP", + "text processing", + "localization", ], classifiers=[ "Development Status :: 5 - Production/Stable", diff --git a/tests/__init__.py b/tests/__init__.py index 6ba23adda..e569951cd 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -20,11 +20,10 @@ tnc, ttc, wordnet, - download + download, ) from pythainlp.soundex import lk82, metasound, soundex, udom83 -from pythainlp.spell import correct, spell -from pythainlp.spell.pn import NorvigSpellChecker, dictionary, known, prob +from pythainlp.spell import correct, spell, NorvigSpellChecker from pythainlp.summarize import summarize from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tag.locations import tag_provinces @@ -37,7 +36,7 @@ multi_cut, newmm, dict_trie, - Tokenizer + Tokenizer, ) from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.tokenize import ( @@ -58,9 +57,9 @@ digit_to_text, eng_to_thai, find_keyword, - is_thai, - is_thaichar, - is_thaiword, + countthai, + isthai, + isthaichar, normalize, now_reign_year, num_to_thaiword, @@ -72,9 +71,9 @@ thai_strftime, thai_to_eng, thaiword_to_num, - thaicheck + thaicheck, ) -#from pythainlp.ulmfit import rm_brackets + class TestUM(unittest.TestCase): """ @@ -177,10 +176,6 @@ def test_spell(self): self.assertEqual(correct(""), "") 
self.assertIsNotNone(correct("āļ—āļ”āļŠāļ­āļ‡")) - self.assertIsNotNone(dictionary()) - self.assertGreaterEqual(prob("āļĄāļĩ"), 0) - self.assertIsNotNone(known(["āđ€āļāļīāļ”", "abc", ""])) - checker = NorvigSpellChecker(dict_filter="") self.assertIsNotNone(checker.dictionary()) self.assertGreaterEqual(checker.prob("āļĄāļĩ"), 0) @@ -262,7 +257,13 @@ def test_ner(self): self.assertEqual(ner.get_ner(""), []) self.assertIsNotNone(ner.get_ner("āđāļĄāļ§āļ—āļģāļ­āļ°āđ„āļĢāļ•āļ­āļ™āļŦāđ‰āļēāđ‚āļĄāļ‡āđ€āļŠāđ‰āļē")) self.assertIsNotNone(ner.get_ner("āđāļĄāļ§āļ—āļģāļ­āļ°āđ„āļĢāļ•āļ­āļ™āļŦāđ‰āļēāđ‚āļĄāļ‡āđ€āļŠāđ‰āļē", pos=False)) - self.assertIsNotNone(ner.get_ner("āļ„āļ“āļ°āļ§āļīāļ—āļĒāļēāļĻāļēāļŠāļ•āļĢāđŒāļ›āļĢāļ°āļĒāļļāļāļ•āđŒāđāļĨāļ°āļ§āļīāļĻāļ§āļāļĢāļĢāļĄāļĻāļēāļŠāļ•āļĢāđŒ āļ—āļĩāđˆāļ­āļĒāļđāđˆ āļĄāļŦāļēāļ§āļīāļ—āļĒāļēāļĨāļąāļĒāļ‚āļ­āļ™āđāļāđˆāļ™ āļ§āļīāļ—āļĒāļēāđ€āļ‚āļ•āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 112 āļŦāļĄāļđāđˆ 7 āļšāđ‰āļēāļ™āļŦāļ™āļ­āļ‡āđ€āļ”āļīāđˆāļ™ āļ•āļģāļšāļĨāļŦāļ™āļ­āļ‡āļāļ­āļĄāđ€āļāļēāļ° āļ­āļģāđ€āļ āļ­āđ€āļĄāļ·āļ­āļ‡ āļˆāļąāļ‡āļŦāļ§āļąāļ”āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 43000")) + self.assertIsNotNone( + ner.get_ner( + """āļ„āļ“āļ°āļ§āļīāļ—āļĒāļēāļĻāļēāļŠāļ•āļĢāđŒāļ›āļĢāļ°āļĒāļļāļāļ•āđŒāđāļĨāļ°āļ§āļīāļĻāļ§āļāļĢāļĢāļĄāļĻāļēāļŠāļ•āļĢāđŒ āļĄāļŦāļēāļ§āļīāļ—āļĒāļēāļĨāļąāļĒāļ‚āļ­āļ™āđāļāđˆāļ™ + āļ§āļīāļ—āļĒāļēāđ€āļ‚āļ•āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 112 āļŦāļĄāļđāđˆ 7 āļšāđ‰āļēāļ™āļŦāļ™āļ­āļ‡āđ€āļ”āļīāđˆāļ™ āļ•āļģāļšāļĨāļŦāļ™āļ­āļ‡āļāļ­āļĄāđ€āļāļēāļ° āļ­āļģāđ€āļ āļ­āđ€āļĄāļ·āļ­āļ‡ + āļˆāļąāļ‡āļŦāļ§āļąāļ”āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 43000""" + ) + ) # self.assertEqual( # ner.get_ner("āđāļĄāļ§āļ—āļģāļ­āļ°āđ„āļĢāļ•āļ­āļ™āļŦāđ‰āļēāđ‚āļĄāļ‡āđ€āļŠāđ‰āļē"), # [ @@ -339,8 +340,9 @@ def test_word_tokenize(self): self.assertIsNotNone(word_tokenize("āļ—āļ”āļŠāļ­āļš", engine="XX")) self.assertIsNotNone(word_tokenize("āļ—āļ”āļŠāļ­āļš", engine="deepcut")) self.assertIsNotNone(word_tokenize("", engine="deepcut")) + def test_Tokenizer(self): - t_test=Tokenizer() + t_test = Tokenizer() self.assertEqual(t_test.word_tokenize(""), []) def test_word_tokenize_icu(self): @@ -399,7 +401,8 @@ def test_sent_tokenize(self): self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), []) self.assertEqual( - sent_tokenize("āļĢāļąāļāļ™āđ‰āļģ āļĢāļąāļāļ›āļĨāļē ", engine="whitespace"), ["āļĢāļąāļāļ™āđ‰āļģ", "āļĢāļąāļāļ›āļĨāļē", ""] + sent_tokenize("āļĢāļąāļāļ™āđ‰āļģ āļĢāļąāļāļ›āļĨāļē ", engine="whitespace"), + ["āļĢāļąāļāļ™āđ‰āļģ", "āļĢāļąāļāļ›āļĨāļē", ""], ) self.assertEqual(sent_tokenize("āļĢāļąāļāļ™āđ‰āļģ āļĢāļąāļāļ›āļĨāļē "), ["āļĢāļąāļāļ™āđ‰āļģ", "āļĢāļąāļāļ›āļĨāļē"]) @@ -416,9 +419,9 @@ def test_syllable_tokenize(self): ) def test_tcc(self): - self.assertEqual(tcc.tcc(None), "") - self.assertEqual(tcc.tcc(""), "") - self.assertEqual(tcc.tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), "āļ›/āļĢāļ°/āđ€āļ—/āļĻ/āđ„āļ—/āļĒ") + self.assertEqual(tcc.tcc(None), []) + self.assertEqual(tcc.tcc(""), []) + self.assertEqual(tcc.tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), ["āļ›", "āļĢāļ°", "āđ€āļ—", "āļĻ", "āđ„āļ—", "āļĒ"]) self.assertEqual(list(tcc.tcc_gen("")), []) self.assertEqual(tcc.tcc_pos(""), set()) @@ -558,20 +561,24 @@ def test_normalize(self): # ### pythainlp.util.thai - def test_is_thai(self): - self.assertEqual(is_thai("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), {"thai": 100.0}) - self.assertIsNotNone(is_thai("āđ€āļœāļ·āļ­āļ", check_all=True)) - self.assertIsNotNone(is_thai("āđ€āļœāļ·āļ­āļabc", check_all=True)) - - def test_is_thaichar(self): - self.assertEqual(is_thaichar("āļ"), True) - self.assertEqual(is_thaichar("a"), False) - 
self.assertEqual(is_thaichar("0"), False) - - def test_is_thaiword(self): - self.assertEqual(is_thaiword("āđ„āļ—āļĒ"), True) - self.assertEqual(is_thaiword("āļ•.āļ„."), True) - self.assertEqual(is_thaiword("āđ„āļ—āļĒ0"), False) + def test_countthai(self): + self.assertEqual(countthai(""), 0) + self.assertEqual(countthai("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), 100.0) + self.assertEqual(countthai("(āļāļāļ•.)", ".()"), 100.0) + self.assertEqual(countthai("(āļāļāļ•.)", None), 50.0) + + def test_isthaichar(self): + self.assertEqual(isthaichar("āļ"), True) + self.assertEqual(isthaichar("a"), False) + self.assertEqual(isthaichar("0"), False) + + def test_isthai(self): + self.assertEqual(isthai("āđ„āļ—āļĒ"), True) + self.assertEqual(isthai("āđ„āļ—āļĒ0"), False) + self.assertEqual(isthai("āļ•.āļ„."), True) + self.assertEqual(isthai("(āļ•.āļ„.)"), False) + self.assertEqual(isthai("āļ•.āļ„.", ignore_chars=None), False) + self.assertEqual(isthai("(āļ•.āļ„.)", ignore_chars=".()"), True) def test_is_thaicheck(self): self.assertEqual(thaicheck("āļ•āļē"), True) @@ -608,5 +615,6 @@ def test_thai2vec(self): word_vector.doesnt_match(["āļāļĩāđˆāļ›āļļāđˆāļ™", "āļžāļĄāđˆāļē", "āđ„āļ­āļ•āļīāļĄ"]), "āđ„āļ­āļ•āļīāļĄ" ) + if __name__ == "__main__": unittest.main()
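The hunks above rework several public APIs. The short Python sketches that follow are illustrative usage notes only, not part of the patch; they use only names and signatures visible in the hunks, with sample strings borrowed from tests/__init__.py unless marked hypothetical. First, spell checking: the module-level dictionary()/known()/prob()/spell()/correct() helpers in pythainlp.spell.pn are removed, and the same behaviour is reached through a NorvigSpellChecker instance (DEFAULT_SPELL_CHECKER backs the package-level spell() and correct() shortcuts).

# Hedged sketch of the reworked spell-checking API (signatures from the hunks above)
from pythainlp.spell import NorvigSpellChecker, correct, spell

print(spell("āļ—āļ”āļŠāļ­āļ‡"))     # candidate corrections, sorted by corpus frequency
print(correct("āļ—āļ”āļŠāļ­āļ‡"))   # single most probable correction

checker = NorvigSpellChecker()            # word frequencies from the Thai National Corpus
print(checker.known(["āđ€āļāļīāļ”", "abc"]))      # subset found in the spelling dictionary
print(checker.prob("āļĄāļĩ"))                  # unigram probability of a word
print(len(checker.dictionary()))          # number of (word, frequency) entries in use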
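The character checks in pythainlp.util are renamed: is_thaichar becomes isthaichar, is_thaiword becomes isthai (gaining an ignore_chars parameter), and is_thai becomes countthai, which now returns the percentage of Thai characters as a float. A sketch mirroring the assertions in tests/__init__.py:

from pythainlp.util import countthai, isthai, isthaichar

print(isthaichar("āļ"))                           # True
print(isthai("āļ•.āļ„."))                            # True  ("." is ignored by default)
print(isthai("(āļ•.āļ„.)"))                          # False (brackets are not ignored)
print(isthai("(āļ•.āļ„.)", ignore_chars=".()"))      # True
print(countthai("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"))                    # 100.0
print(countthai("(āļāļāļ•.)", ignore_chars=".()"))    # 100.0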
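pos_tag() keeps its perceptron/unigram/artagger engines but gains an "orchid_ud" corpus value: words are tagged against ORCHID and the tags are then mapped to Universal Dependencies labels through _TAG_MAP_UD (with the āļāļēāļĢ/āļ„āļ§āļēāļĄ exception handled by _UD_Exception), which is also what ThaiNameTagger now uses internally. A minimal sketch, assuming the ORCHID perceptron model and corpus data are installed:

from pythainlp.tag import pos_tag, pos_tag_sents
from pythainlp.tokenize import word_tokenize

words = word_tokenize("āđāļĄāļ§āļ—āļģāļ­āļ°āđ„āļĢāļ•āļ­āļ™āļŦāđ‰āļēāđ‚āļĄāļ‡āđ€āļŠāđ‰āļē")
# ORCHID tags mapped to Universal Dependencies labels
print(pos_tag(words, engine="perceptron", corpus="orchid_ud"))
# the same tagging applied to a batch of tokenized sentences
print(pos_tag_sents([words], engine="perceptron", corpus="orchid_ud"))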
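dict_trie() now accepts any iterable of words (or a path to a word-list file), and the Tokenizer class stores a trie together with an engine name so the engine can be swapped per instance. A sketch with a hypothetical two-word custom dictionary (the vocabulary and input string are made up for illustration):

from pythainlp.tokenize import Tokenizer, dict_trie, dict_word_tokenize

custom_words = {"āļĢāļąāļāļ™āđ‰āļģ", "āļĢāļąāļāļ›āļĨāļē"}    # hypothetical custom vocabulary
trie = dict_trie(custom_words)         # any iterable of words now works

# one-off tokenization against the custom trie
print(dict_word_tokenize("āļĢāļąāļāļ™āđ‰āļģāļĢāļąāļāļ›āļĨāļē", custom_dict=trie, engine="newmm"))

# reusable tokenizer object holding the dictionary and the engine name
t = Tokenizer(custom_dict=custom_words, tokenize_engine="newmm")
print(t.word_tokenize("āļĢāļąāļāļ™āđ‰āļģāļĢāļąāļāļ›āļĨāļē"))
t.set_tokenize_engine("longest")       # switch the engine for subsequent calls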
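Finally, tcc() now returns a list of Thai Character Cluster strings instead of a "/"-joined string; callers that relied on the old output can join the list themselves. The expected list below matches the updated test in tests/__init__.py:

from pythainlp.tokenize import subword_tokenize
from pythainlp.tokenize.tcc import tcc

print(tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"))                               # ['āļ›', 'āļĢāļ°', 'āđ€āļ—', 'āļĻ', 'āđ„āļ—', 'āļĒ']
print("/".join(tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")))                      # reproduces the old "/"-joined form
print(subword_tokenize("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ", engine="tcc"))     # subword_tokenize uses the same TCC rule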