From c6a39d2ed77de81c62c33c7b54fec15ea94f3808 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Sat, 16 Nov 2019 16:59:02 +0000
Subject: [PATCH 1/6] Remove artagger

---
 README.md                      |  2 --
 appveyor.docs.yml              |  4 ++--
 appveyor.yml                   |  2 --
 docs/api/tag.rst               |  6 +-----
 docs/notes/installation.rst    |  1 -
 pythainlp/tag/__init__.py      | 36 +++++++---------------------------
 pythainlp/tokenize/__init__.py |  2 +-
 setup.py                       |  2 --
 tests/test_tag.py              |  8 --------
 9 files changed, 11 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index a820eeda1..9f838989a 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,6 @@ pip install pythainlp[extra1,extra2,...]
 ```
 
 where `extras` can be
-  - `artagger` (to support artagger part-of-speech tagger)
   - `attacut` (to support attacut, a fast and accurate tokenizer)
   - `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)
   - `ipa` (for IPA, International Phonetic Alphabet, support in transliteration)
@@ -177,7 +176,6 @@ pip install pythainlp[extra1,extra2,...]
 ```
 
 โดยที่ `extras` คือ
-  - `artagger` (สำหรับตัวติดป้ายกำกับชนิดคำ artagger)
   - `attacut` (ตัวตัดคำที่แม่นกว่า `newmm` เมื่อเทียบกับชุดข้อมูล BEST)
   - `icu` (สำหรับการถอดตัวสะกดเป็นสัทอักษรและการตัดคำด้วย ICU)
   - `ipa` (สำหรับการถอดตัวสะกดเป็นสัทอักษรสากล (IPA))
diff --git a/appveyor.docs.yml b/appveyor.docs.yml
index a566e1fe9..34c1dcea8 100644
--- a/appveyor.docs.yml
+++ b/appveyor.docs.yml
@@ -42,8 +42,8 @@ install:
   - export LD_LIBRARY_PATH=/usr/local/lib
   - sudo pip3 install -r requirements.txt
   - sudo pip3 install torch==1.2.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-  - sudo pip3 install --upgrade artagger emoji epitran gensim numpy pandas pyicu sklearn-crfsuite ssg
-  - sudo pip3 install --upgrade "tensorflow==1.14,<2"deepcut
+  - sudo pip3 install --upgrade emoji epitran gensim numpy pandas pyicu sklearn-crfsuite ssg
+  - sudo pip3 install --upgrade "tensorflow>=2,<3" deepcut
   - sudo pip3 install --upgrade boto smart_open sphinx sphinx-rtd-theme
 
 #---------------------------------#
diff --git a/appveyor.yml b/appveyor.yml
index d0e7a45c0..2f573348a 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -48,7 +48,6 @@ environment:
     PYTHONIOENCODING: "utf-8"
     ICU_VERSION: "64.2"
     DISTUTILS_USE_SDK: "1"
-    ARTAGGER_PKG: "https://github.com/franziz/artagger/archive/master.zip"
     PYTHAINLP_DATA_DIR: "%LOCALAPPDATA%/pythainlp-data"
 
   matrix:
@@ -101,7 +100,6 @@ install:
   - pip install "tensorflow>=2,<3" deepcut
   - pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
   - pip install %PYICU_PKG%
-  - pip install %ARTAGGER_PKG%
   - pip install -e .[full]
 
 #---------------------------------#
diff --git a/docs/api/tag.rst b/docs/api/tag.rst
index 5ae2f8b66..fcba8145a 100644
--- a/docs/api/tag.rst
+++ b/docs/api/tag.rst
@@ -207,14 +207,10 @@ unigram
 
 Unigram tagger doesn't take the ordering of words in the list into account.
 
-artagger
-++++++++
-
-`artagger <https://github.com/franziz/artagger>`_ is an implementation of `RDRPOSTagger <https://github.com/datquocnguyen/RDRPOSTagger>`_ for tagging POS in Thai language.
 
 References
 ----------
 
 .. [#Sornlertlamvanich_2000] Takahashi, Naoto & Isahara, Hitoshi & Sornlertlamvanich, Virach. (2000).
    Building a Thai part-of-speech tagged corpus (ORCHID).
-   ournal of the Acoustical Society of Japan (E). 20. 10.1250/ast.20.189.
+   Journal of the Acoustical Society of Japan (E). 20. 10.1250/ast.20.189.
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
index d25ab6e2b..fa17a280b 100644
--- a/docs/notes/installation.rst
+++ b/docs/notes/installation.rst
@@ -14,7 +14,6 @@ For some functionalities, like named entity recognition, extra packages may be n
     pip install pythainlp[extra1,extra2,...]
 
 where ``extras`` can be
-  - ``artagger`` (to support artagger part-of-speech tagger)
   - ``attacut`` (to support attacut, a fast and accurate tokenizer)
   - ``icu`` (for ICU, International Components for Unicode, support in transliteration and tokenization)
   - ``ipa`` (for IPA, International Phonetic Alphabet, support in transliteration)
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
index 45fa828e7..a636bdc48 100644
--- a/pythainlp/tag/__init__.py
+++ b/pythainlp/tag/__init__.py
@@ -104,23 +104,14 @@ def _orchid_to_ud(tag) -> List[Tuple[str, str]]:
     _i = 0
     temp = []
     while _i < len(tag):
-        temp.append((tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]])))
+        temp.append(
+            (tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]]))
+        )
         _i += 1
 
     return temp
 
 
-def _artagger_tag(words: List[str], corpus: str = None) -> List[Tuple[str, str]]:
-    if not words:
-        return []
-
-    from artagger import Tagger
-
-    words_ = Tagger().tag(" ".join(words))
-
-    return [(word.word, word.tag) for word in words_]
-
-
 def pos_tag(
     words: List[str], engine: str = "perceptron", corpus: str = "orchid"
 ) -> List[Tuple[str, str]]:
@@ -132,7 +123,6 @@ def pos_tag(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
-        * *artagger* - RDR POS tagger
     :param str corpus:
         * *orchid* - annotated Thai academic articles namedly `Orchid
           `_ (default)
@@ -145,10 +135,6 @@ def pos_tag(
     :return: returns a list of labels regarding which part of speech it is
     :rtype: list[tuple[str, str]]
 
-    :Note:
-        * *artagger*, only support one sentence and the sentence must
-          be tokenized beforehand.
-
     :Example:
 
     Tag words with corpus `orchid` (default)::
@@ -188,32 +174,23 @@ def pos_tag(
         # ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
         # ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]
 
-    Tag words with different engines including *perceptron*, *unigram*,
-    and *artagger*::
+    Tag words with different engines including *perceptron* and *unigram*::
 
         from pythainlp.tag import pos_tag
 
         words = ['เก้าอี้', 'มี', 'จำนวน', 'ขา', '', '', '3']
 
         pos_tag(words, engine='unigram', corpus='pud')
         # output:
         # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
         #  ('', None), ('', None), ('3', 'NUM')]
-
-        pos_tag(words, engine='artagger', corpus='orchid')
-        # output:
-        # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
-        #  ('ขา', 'NCMN'), ('', 'PUNC'),
-        #  ('', 'PUNC'), ('3', 'NCNM')]
     """
 
     # NOTE:
     # ...
 
     if engine == "perceptron":
         from .perceptron import tag as tag_
-    elif engine == "artagger":
-        tag_ = _artagger_tag
     else:  # default, use "unigram" ("old") engine
         from .unigram import tag as tag_
     _tag = tag_(words, corpus=corpus)
@@ -235,7 +212,9 @@
 
 
 def pos_tag_sents(
-    sentences: List[List[str]], engine: str = "perceptron", corpus: str = "orchid"
+    sentences: List[List[str]],
+    engine: str = "perceptron",
+    corpus: str = "orchid",
 ) -> List[List[Tuple[str, str]]]:
     """
     The function tag multiple list of tokenized words into Part-of-Speech
@@ -245,7 +224,6 @@ def pos_tag_sents(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
-        * *artagger* - RDR POS tagger
     :param str corpus:
         * *orchid* - annotated Thai academic articles namedly\
          `Orchid `_\
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index cbc36d2a0..de8f18669 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -418,7 +418,7 @@ class Tokenizer:
         #   'ผิดปกติ', 'ของ', 'การ', 'พูด']
 
     Tokenizer object instantiated with a file path containing list of
-    word separated with *newline* and explicitly set a new tokeneizer
+    word separated with *newline* and explicitly set a new tokenizer
     after initiation::
 
         PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txtt'
diff --git a/setup.py b/setup.py
index 2d9c74efb..a2bb42b26 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,6 @@
 ]
 
 extras = {
-    "artagger": ["artagger>=0.1.0.3"],
     "attacut": ["attacut>=1.0.4"],
     "benchmarks": ["numpy>=1.16", "pandas>=0.24"],
     "icu": ["pyicu>=2.3"],
@@ -54,7 +53,6 @@
     "thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16"],
     "thai2rom": ["torch>=1.0.0", "numpy>=1.16"],
     "full": [
-        "artagger>=0.1.0.3",
         "attacut>=1.0.4",
         "emoji>=0.5.1",
         "epitran>=1.1",
diff --git a/tests/test_tag.py b/tests/test_tag.py
index 6a6f2dc0f..fe09839d1 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -44,14 +44,6 @@ def test_pos_tag(self):
         self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
         self.assertEqual(perceptron.tag([], corpus="orchid"), [])
 
-        self.assertIsNotNone(pos_tag(None, engine="artagger"))
-        self.assertIsNotNone(pos_tag([], engine="artagger"))
-        self.assertIsNotNone(pos_tag(tokens, engine="artagger"))
-        self.assertEqual(
-            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="artagger"),
-            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
-        )
-
         self.assertEqual(pos_tag_sents(None), [])
         self.assertEqual(pos_tag_sents([]), [])
         self.assertEqual(
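Note on PATCH 1: after this change, `pos_tag` dispatches only to the *perceptron* (default) and *unigram* engines; any other `engine` value, including a leftover `"artagger"`, now falls through to the `unigram` branch of the `if`/`else` shown above. A minimal sketch of the surviving call path, using only names that appear in this patch (the exact tags returned depend on the installed corpus data):

```python
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

# pos_tag() expects a list of tokens, so tokenize first.
words = word_tokenize("คุณกำลังประชุม")

print(pos_tag(words, engine="perceptron", corpus="orchid"))  # the defaults
print(pos_tag(words, engine="unigram", corpus="orchid"))
```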
From 4a8b55ac2b7818ff4b40f1cb708c0354ab7a542f Mon Sep 17 00:00:00 2001
From: bact
Date: Mon, 25 Nov 2019 08:16:57 +0000
Subject: [PATCH 2/6] bump attacut version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a2bb42b26..3d9b71bb4 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,7 @@
 ]
 
 extras = {
-    "attacut": ["attacut>=1.0.4"],
+    "attacut": ["attacut>=1.0.6"],
     "benchmarks": ["numpy>=1.16", "pandas>=0.24"],
     "icu": ["pyicu>=2.3"],
     "ipa": ["epitran>=1.1"],

From 0430f6b61ec81cbf649c754cedda2a940c28af13 Mon Sep 17 00:00:00 2001
From: bact
Date: Mon, 25 Nov 2019 23:20:58 +0000
Subject: [PATCH 3/6] fix typo "sounddex" -> "soundex"

---
 pythainlp/cli/soundex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/cli/soundex.py b/pythainlp/cli/soundex.py
index a983cad83..a7f427d24 100644
--- a/pythainlp/cli/soundex.py
+++ b/pythainlp/cli/soundex.py
@@ -7,7 +7,7 @@
 
 class App:
     def __init__(self, argv):
-        parser = argparse.ArgumentParser("sounddex")
+        parser = argparse.ArgumentParser("soundex")
         parser.add_argument(
             "--text",
             type=str,

From 69f231986ed4f23a0c9e40bf76ec58d9e3a97945 Mon Sep 17 00:00:00 2001
From: bact
Date: Mon, 25 Nov 2019 23:25:12 +0000
Subject: [PATCH 4/6] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9f838989a..7d87b2da6 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil
 
 **This is a document for development branch (post 2.0). Things will break.**
 
 - The latest stable release is [2.0.7](https://github.com/PyThaiNLP/pythainlp/releases)
-- The latest development release is [2.1.dev7](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
+- The latest development release is [2.1.dev8](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
 - 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page
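Note on PATCH 3: the corrected string is the first positional argument of `argparse.ArgumentParser`, which becomes the parser's `prog` name and is printed verbatim in usage and `--help` messages, so the typo was user-visible. A standard-library-only illustration (the `--text` option mirrors the one in the diff):

```python
import argparse

# The first positional argument to ArgumentParser is "prog",
# the program name shown in usage and help output.
parser = argparse.ArgumentParser("soundex")
parser.add_argument("--text", type=str)

parser.print_usage()  # prints: usage: soundex [-h] [--text TEXT]
```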
From 3921e66efaafd68079bb18ee77cb8fc64d6e5a13 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 5 Dec 2019 22:01:01 +0700
Subject: [PATCH 5/6] =?UTF-8?q?Update=20words=5Fth.txt=20:=20del=20?=
 =?UTF-8?q?=E0=B8=8A=E0=B8=B4=E0=B8=8A=E0=B8=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pythainlp/corpus/words_th.txt | 1 -
 tests/test_tokenize.py        | 7 +++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pythainlp/corpus/words_th.txt b/pythainlp/corpus/words_th.txt
index 7fc7e0131..4feccea0f 100755
--- a/pythainlp/corpus/words_th.txt
+++ b/pythainlp/corpus/words_th.txt
@@ -15594,7 +15594,6 @@
 ชิงไหวชิงพริบ
 ชิงฮื้อ
 ชิชะ
-ชิชิ
 ชิณณะ
 ชิด
 ชิดขวา
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index d47b966b3..21b9224f2 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -208,6 +208,13 @@ def test_word_tokenize_newmm(self):
         self.assertIsNotNone(
             word_tokenize(long_danger_text, engine="newmm-safe")
         )
+        text = "ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ"
+        self.assertIsNotNone(
+            word_tokenize(text, engine="newmm")
+        )
+        self.assertIsNotNone(
+            word_tokenize(text, engine="newmm-safe")
+        )
 
     def test_word_tokenize_attacut(self):
         self.assertEqual(attacut.segment(None), [])

From fbc87a5b87d481bdc3671537e19d5da542179551 Mon Sep 17 00:00:00 2001
From: bact
Date: Fri, 6 Dec 2019 16:17:24 +0000
Subject: [PATCH 6/6] Update test_tokenize.py

---
 tests/test_tokenize.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 21b9224f2..eec517826 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -196,24 +196,26 @@ def test_word_tokenize_newmm(self):
         self.assertIsNotNone(word_tokenize(long_text, engine="newmm"))
         self.assertIsNotNone(word_tokenize(long_text, engine="newmm-safe"))
 
-        short_danger_text = """
+        danger_text1 = """
+        ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ
+        """
+        danger_text2 = """
         ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้าน
         """
-        long_danger_text = """
+        danger_text3 = """
         ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก
         """
 
         self.assertIsNotNone(
-            word_tokenize(short_danger_text, engine="newmm-safe")
+            word_tokenize(danger_text1, engine="newmm")
         )
         self.assertIsNotNone(
-            word_tokenize(long_danger_text, engine="newmm-safe")
+            word_tokenize(danger_text1, engine="newmm-safe")
         )
-        text = "ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ"
         self.assertIsNotNone(
-            word_tokenize(text, engine="newmm")
+            word_tokenize(danger_text2, engine="newmm-safe")
         )
         self.assertIsNotNone(
-            word_tokenize(text, engine="newmm-safe")
+            word_tokenize(danger_text3, engine="newmm-safe")
         )
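Note on PATCH 5 and PATCH 6: the strings added here are stress inputs for the `newmm` tokenizer, long runs of a repeated syllable that can produce a combinatorial number of candidate segmentations (PATCH 5 also deletes the dictionary entry ชิชิ from `words_th.txt`). The tests assert only a non-`None` result because the exact tokenization is unspecified; what matters is that both engines terminate. A hedged reproduction sketch, with engine names as used in the tests:

```python
from pythainlp.tokenize import word_tokenize

# Repetitive text in the spirit of danger_text1: many repeats of "ชิ"
# give the dictionary matcher a huge number of possible word boundaries.
danger_text = "ชิชิ" * 30

# "newmm" is the plain dictionary-based maximal-matching engine;
# "newmm-safe" is the variant the tests exercise to guard against
# pathological running time on such input.
print(word_tokenize(danger_text, engine="newmm"))
print(word_tokenize(danger_text, engine="newmm-safe"))
```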