diff --git a/README-pypi.md b/README-pypi.md index 886a37edb..b779f157e 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -1,6 +1,6 @@ ![PyThaiNLP Logo](https://avatars0.githubusercontent.com/u/32934255?s=200&v=4) -# PyThaiNLP 2.0 +# PyThaiNLP 2.0.2 [![Codacy Badge](https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7)](https://www.codacy.com/app/pythainlp/pythainlp_2?utm_source=github.com&utm_medium=referral&utm_content=PyThaiNLP/pythainlp&utm_campaign=Badge_Grade)[![pypi](https://img.shields.io/pypi/v/pythainlp.svg)](https://pypi.python.org/pypi/pythainlp) [![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp.svg?branch=develop)](https://travis-ci.org/PyThaiNLP/pythainlp) @@ -12,9 +12,9 @@ PyThaiNLP is a Python library for natural language processing (NLP) of Thai lang PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, part-of-speech taggers, and spell checkers. -📖 For details on upgrading from PyThaiNLP 1.7 to PyThaiNLP 2.0, see [From PyThaiNLP 1.7 to PyThaiNLP 2.0](https://thainlp.org/pythainlp/docs/2.0/notes/pythainlp-1_7-2_0.html) +📖 [Upgrading from PyThaiNLP 1.7 to 2.0](https://thainlp.org/pythainlp/docs/2.0/notes/pythainlp-1_7-2_0.html) -📖 For ThaiNER user after upgrading from PyThaiNLP 1.7 to PyThaiNLP 2.0, see [Upgrade ThaiNER from PyThaiNLP 1.7 to PyThaiNLP 2.0](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0) +📖 [Upgrade ThaiNER from PyThaiNLP 1.7 to 2.0](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0) ðŸ“Ŧ follow us on Facebook [Pythainlp](https://www.facebook.com/pythainlp/) diff --git a/README.md b/README.md index fd2c26acb..e25c27d97 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,10 @@ Thai Natural Language Processing in Python. PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with focus on Thai language. - [Current PyThaiNLP stable release is 2.0](https://github.com/PyThaiNLP/pythainlp/tree/master) -- PyThaiNLP 2.0 will support only Python 3.6+. Some functions may work with older version of Python 3, but it is not well-tested and will not be supported. See [PyThaiNLP 2.0 change log](https://github.com/PyThaiNLP/pythainlp/issues/118). -- Python 2 users can use PyThaiNLP 1.6, our latest released that tested with Python 2.7. +- PyThaiNLP 2.0 supports Python 3.6+. Some functions may work with older version of Python 3, but it is not well-tested and will not be supported. See [PyThaiNLP 2.0 change log](https://github.com/PyThaiNLP/pythainlp/issues/118). +- Python 2.7+ users can use PyThaiNLP 1.6. -**This is a document for development branch (post 1.7.x). Things will break. For a stable branch document, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** +**This is a document for development branch (post 2.0). Things will break. For a stable branch document, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** ðŸ“Ŧ follow us on Facebook [PyThaiNLP](https://www.facebook.com/pythainlp/) @@ -102,13 +102,10 @@ PyThaiNLP āđ€āļ›āđ‡āļ™āđ„āļĨāļšāļēāļĢāļĩāļ āļēāļĐāļēāđ„āļžāļ—āļ­āļ™āđ€āļžāļ·āđˆ > āđ€āļžāļĢāļēāļ°āđ‚āļĨāļāļ‚āļąāļšāđ€āļ„āļĨāļ·āđˆāļ­āļ™āļ•āđˆāļ­āđ„āļ›āļ”āđ‰āļ§āļĒāļāļēāļĢāđāļšāđˆāļ‡āļ›āļąāļ™ -āļĢāļ­āļ‡āļĢāļąāļš Python 3.6 āļ‚āļķāđ‰āļ™āđ„āļ› +- PyThaiNLP 2.0 āļĢāļ­āļ‡āļĢāļąāļš Python 3.6 āļ‚āļķāđ‰āļ™āđ„āļ› +- āļœāļđāđ‰āđƒāļŠāđ‰ Python 2.7+ āļĒāļąāļ‡āļŠāļēāļĄāļēāļĢāļ–āđƒāļŠāđ‰ PyThaiNLP 1.6 āđ„āļ”āđ‰ -- āļ•āļąāđ‰āļ‡āđāļ•āđˆāļĢāļļāđˆāļ™ 1.7 PyThaiNLP āļˆāļ°āđ€āļĨāļīāļāļŠāļ™āļąāļšāļŠāļ™āļļāļ™ Python 2 (āļšāļēāļ‡āļŸāļąāļ‡āļāđŒāļŠāļąāļ™āļ­āļēāļˆāļĒāļąāļ‡āļ—āļģāļ‡āļēāļ™āđ„āļ”āđ‰ āđāļ•āđˆāļˆāļ°āđ„āļĄāđˆāđ„āļ”āđ‰āļĢāļąāļšāļāļēāļĢāļŠāļ™āļąāļšāļŠāļ™āļļāļ™) -- āļ•āļąāđ‰āļ‡āđāļ•āđˆāļĢāļļāđˆāļ™ 2.0 āļˆāļ°āļĒāļļāļ•āļīāļāļēāļĢāļĢāļ­āļ‡āļĢāļąāļš Python 2 āļ—āļąāđ‰āļ‡āļŦāļĄāļ” -- āļœāļđāđ‰āđƒāļŠāđ‰ Python 2 āļĒāļąāļ‡āļŠāļēāļĄāļēāļĢāļ–āđƒāļŠāđ‰ PyThaiNLP 1.6 āđ„āļ”āđ‰ - -**āđ€āļ­āļāļŠāļēāļĢāļ™āļĩāđ‰āļŠāļģāļŦāļĢāļąāļšāļĢāļļāđˆāļ™āļžāļąāļ’āļ™āļē (āļŦāļĨāļąāļ‡ 1.7.x) āļ­āļēāļˆāļĄāļĩāļāļēāļĢāđ€āļ›āļĨāļĩāđˆāļĒāļ™āđāļ›āļĨāļ‡āđ„āļ”āđ‰āļ•āļĨāļ­āļ” āļŠāļģāļŦāļĢāļąāļšāđ€āļ­āļāļŠāļēāļĢāļĢāļļāđˆāļ™āđ€āļŠāļ–āļĩāļĒāļĢ āļ”āļđāļ—āļĩāđˆ [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** +**āđ€āļ­āļāļŠāļēāļĢāļ™āļĩāđ‰āļŠāļģāļŦāļĢāļąāļšāļĢāļļāđˆāļ™āļžāļąāļ’āļ™āļē (āļŦāļĨāļąāļ‡ 2.0) āļ­āļēāļˆāļĄāļĩāļāļēāļĢāđ€āļ›āļĨāļĩāđˆāļĒāļ™āđāļ›āļĨāļ‡āđ„āļ”āđ‰āļ•āļĨāļ­āļ” āļŠāļģāļŦāļĢāļąāļšāđ€āļ­āļāļŠāļēāļĢāļĢāļļāđˆāļ™āđ€āļŠāļ–āļĩāļĒāļĢ āļ”āļđāļ—āļĩāđˆ [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** ðŸ“Ŧ āļ•āļīāļ”āļ•āļēāļĄāļ‚āđˆāļēāļ§āļŠāļēāļĢāđ„āļ”āđ‰āļ—āļĩāđˆ Facebook [Pythainlp](https://www.facebook.com/pythainlp/) diff --git a/bin/pythainlp b/bin/pythainlp index 3582b89ad..1e3a68691 100644 --- a/bin/pythainlp +++ b/bin/pythainlp @@ -45,4 +45,4 @@ elif args.soundex!=None: args.engine="lk82" print(soundex(args.soundex, engine=args.engine)) else: - print("PyThaiNLP 2.0") + print("PyThaiNLP 2.0.2") diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index ff5babfff..f25188849 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "2.0.1" %} +{% set version = "2.0.2" %} package: name: pythainlp diff --git a/meta.yaml b/meta.yaml index 651053e4e..714ecb262 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "2.0.1" %} +{% set version = "2.0.2" %} package: name: pythainlp diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 9ab5ff1ad..21a18a9c1 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,6 +1,6 @@ ïŧŋ# -*- coding: utf-8 -*- -__version__ = "2.0.1" +__version__ = "2.0.2" thai_consonants = "āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪ" # 44 chars thai_vowels = "āļĪāļĶāļ°\u0e31āļēāļģ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39āđ€āđāđ‚āđƒāđ„\u0e45\u0e47" # 19 diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 985991415..6f788aaf0 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -54,7 +54,6 @@ "DIAC": "DET", "DIBQ": "DET", "DIAQ": "DET", - "DCNM": "DET", # NUM "NUM": "NUM", "NCNM": "NUM", diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 296460857..b87cf13e5 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -122,17 +122,25 @@ def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: """ :param str text: text to be tokenized - :param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units. + :param str engine: subword tokenizer + :Parameters for engine: + * tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000) + * etcc - Enhanced Thai Character Cluster (Inrut et al. 2001) [In development] :return: a list of tokenized strings. """ if not text: return "" from .tcc import tcc + from .etcc import etcc + if engine == "tcc": + return tcc(text) + elif engine == "etcc": + return etcc(text).split("/") + #default return tcc(text) - def syllable_tokenize(text: str) -> List[str]: """ :param str text: input string to be tokenized diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index 986878001..1df6eaaec 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -3,6 +3,8 @@ āđ‚āļ›āļĢāđāļāļĢāļĄ ETCC āđƒāļ™ Python āļžāļąāļ’āļ™āļēāđ‚āļ”āļĒ āļ™āļēāļĒ āļ§āļĢāļĢāļ“āļžāļ‡āļĐāđŒ āļ āļąāļ—āļ—āļīāļĒāđ„āļžāļšāļđāļĨāļĒāđŒ 19 āļĄāļī.āļĒ. 2560 +Reference: Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. "Thai word segmentation using combination of forward and backward longest matching techniques." In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001. + āļ§āļīāļ˜āļĩāđƒāļŠāđ‰āļ‡āļēāļ™ etcc(āļ„āļģ) diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index 8ef125217..ee945e929 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -1,8 +1,9 @@ ïŧŋ# -*- coding: utf-8 -*- """ Separate Thai text into Thai Character Cluster (TCC). -Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2002) -http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548 +Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2000) +https://dl.acm.org/citation.cfm?id=355225 +http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548 Credits: - TCC: Jakkrit TeCho diff --git a/setup.cfg b/setup.cfg index bb022e678..809721c80 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0.1 +current_version = 2.0.2 commit = True tag = True diff --git a/setup.py b/setup.py index 2fe1ac65f..879e9b93f 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ setup( name="pythainlp", - version="2.0.1", + version="2.0.2", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown",