From c6a39d2ed77de81c62c33c7b54fec15ea94f3808 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Sat, 16 Nov 2019 16:59:02 +0000
Subject: [PATCH 1/6] Remove artagger

---
 README.md                      |  2 --
 appveyor.docs.yml              |  4 ++--
 appveyor.yml                   |  2 --
 docs/api/tag.rst               |  6 +-----
 docs/notes/installation.rst    |  1 -
 pythainlp/tag/__init__.py      | 36 +++++++---------------------------
 pythainlp/tokenize/__init__.py |  2 +-
 setup.py                       |  2 --
 tests/test_tag.py              |  8 --------
 9 files changed, 11 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index a820eeda1..9f838989a 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,6 @@ pip install pythainlp[extra1,extra2,...]
 ```
 
 where `extras` can be
-  - `artagger` (to support artagger part-of-speech tagger)
   - `attacut` (to support attacut, a fast and accurate tokenizer)
   - `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)
   - `ipa` (for IPA, International Phonetic Alphabet, support in transliteration)
@@ -177,7 +176,6 @@ pip install pythainlp[extra1,extra2,...]
 ```
 
 โดยที่ `extras` คือ
-  - `artagger` (สำหรับตัวติดป้ายกำกับชนิดคำ artagger)
   - `attacut` (ตัวตัดคำที่แม่นกว่า `newmm` เมื่อเทียบกับชุดข้อมูล BEST)
   - `icu` (สำหรับการถอดตัวสะกดเป็นสัทอักษรและการตัดคำด้วย ICU)
   - `ipa` (สำหรับการถอดตัวสะกดเป็นสัทอักษรสากล (IPA))
diff --git a/appveyor.docs.yml b/appveyor.docs.yml
index a566e1fe9..34c1dcea8 100644
--- a/appveyor.docs.yml
+++ b/appveyor.docs.yml
@@ -42,8 +42,8 @@ install:
   - export LD_LIBRARY_PATH=/usr/local/lib
   - sudo pip3 install -r requirements.txt
   - sudo pip3 install torch==1.2.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-  - sudo pip3 install --upgrade artagger emoji epitran gensim numpy pandas pyicu sklearn-crfsuite ssg
-  - sudo pip3 install --upgrade "tensorflow==1.14,<2"deepcut
+  - sudo pip3 install --upgrade emoji epitran gensim numpy pandas pyicu sklearn-crfsuite ssg
+  - sudo pip3 install --upgrade "tensorflow>=2,<3" deepcut
   - sudo pip3 install --upgrade boto smart_open sphinx sphinx-rtd-theme
 
 #---------------------------------#
diff --git a/appveyor.yml b/appveyor.yml
index d0e7a45c0..2f573348a 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -48,7 +48,6 @@ environment:
     PYTHONIOENCODING: "utf-8"
     ICU_VERSION: "64.2"
     DISTUTILS_USE_SDK: "1"
-    ARTAGGER_PKG: "https://github.com/franziz/artagger/archive/master.zip"
     PYTHAINLP_DATA_DIR: "%LOCALAPPDATA%/pythainlp-data"
 
   matrix:
@@ -101,7 +100,6 @@ install:
   - pip install "tensorflow>=2,<3" deepcut
   - pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
   - pip install %PYICU_PKG%
-  - pip install %ARTAGGER_PKG%
   - pip install -e .[full]
 
 #---------------------------------#
diff --git a/docs/api/tag.rst b/docs/api/tag.rst
index 5ae2f8b66..fcba8145a 100644
--- a/docs/api/tag.rst
+++ b/docs/api/tag.rst
@@ -207,14 +207,10 @@ unigram
 
 Unigram tagger doesn't take the ordering of words in the list into account.
 
-artagger
-++++++++
-
-`artagger <https://github.com/franziz/artagger>`_ is an implementation of `RDRPOSTagger <https://github.com/datquocnguyen/RDRPOSTagger>`_ for tagging POS in Thai language.
 
 References
 ----------
 
 .. [#Sornlertlamvanich_2000] Takahashi, Naoto & Isahara, Hitoshi & Sornlertlamvanich, Virach. (2000).
    Building a Thai part-of-speech tagged corpus (ORCHID).
-   ournal of the Acoustical Society of Japan (E). 20. 10.1250/ast.20.189.
+   Journal of the Acoustical Society of Japan (E). 20. 10.1250/ast.20.189.
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
index d25ab6e2b..fa17a280b 100644
--- a/docs/notes/installation.rst
+++ b/docs/notes/installation.rst
@@ -14,7 +14,6 @@ For some functionalities, like named entity recognition, extra packages may be n
     pip install pythainlp[extra1,extra2,...]
 
 where ``extras`` can be
-  - ``artagger`` (to support artagger part-of-speech tagger)
   - ``attacut`` (to support attacut, a fast and accurate tokenizer)
   - ``icu`` (for ICU, International Components for Unicode, support in transliteration and tokenization)
   - ``ipa`` (for IPA, International Phonetic Alphabet, support in transliteration)
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
index 45fa828e7..a636bdc48 100644
--- a/pythainlp/tag/__init__.py
+++ b/pythainlp/tag/__init__.py
@@ -104,23 +104,14 @@ def _orchid_to_ud(tag) -> List[Tuple[str, str]]:
     _i = 0
     temp = []
     while _i < len(tag):
-        temp.append((tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]])))
+        temp.append(
+            (tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]]))
+        )
         _i += 1
 
     return temp
 
 
-def _artagger_tag(words: List[str], corpus: str = None) -> List[Tuple[str, str]]:
-    if not words:
-        return []
-
-    from artagger import Tagger
-
-    words_ = Tagger().tag(" ".join(words))
-
-    return [(word.word, word.tag) for word in words_]
-
-
 def pos_tag(
     words: List[str], engine: str = "perceptron", corpus: str = "orchid"
 ) -> List[Tuple[str, str]]:
@@ -132,7 +123,6 @@ def pos_tag(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
-        * *artagger* - RDR POS tagger
     :param str corpus:
         * *orchid* - annotated Thai academic articles namedly `Orchid
           `_ (default)
@@ -145,10 +135,6 @@ def pos_tag(
     :return: returns a list of labels regarding which part of speech it is
     :rtype: list[tuple[str, str]]
 
-    :Note:
-        * *artagger*, only support one sentence and the sentence must
-          be tokenized beforehand.
-
     :Example:
 
     Tag words with corpus `orchid` (default)::
@@ -188,32 +174,23 @@ def pos_tag(
         # ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
         # ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]
 
-    Tag words with different engines including *perceptron*, *unigram*,
-    and *artagger*::
+    Tag words with different engines including *perceptron* and *unigram*::
 
         from pythainlp.tag import pos_tag
 
         words = ['เก้าอี้', 'มี', 'จำนวน', 'ขา', '', '', '3']
 
         pos_tag(words, engine='unigram', corpus='pud')
         # output:
         # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
         #  ('', None), ('', None), ('3', 'NUM')]
-
-        pos_tag(words, engine='artagger', corpus='orchid')
-        # output:
-        # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
-        #  ('ขา', 'NCMN'), ('', 'PUNC'),
-        #  ('', 'PUNC'), ('3', 'NCNM')]
     """
 
     # NOTE:
     # ...
 
     if engine == "perceptron":
         from .perceptron import tag as tag_
-    elif engine == "artagger":
-        tag_ = _artagger_tag
     else:  # default, use "unigram" ("old") engine
         from .unigram import tag as tag_
     _tag = tag_(words, corpus=corpus)
@@ -235,7 +212,9 @@
 
 
 def pos_tag_sents(
-    sentences: List[List[str]], engine: str = "perceptron", corpus: str = "orchid"
+    sentences: List[List[str]],
+    engine: str = "perceptron",
+    corpus: str = "orchid",
 ) -> List[List[Tuple[str, str]]]:
     """
     The function tag multiple list of tokenized words into Part-of-Speech
@@ -245,7 +224,6 @@ def pos_tag_sents(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
-        * *artagger* - RDR POS tagger
     :param str corpus:
         * *orchid* - annotated Thai academic articles namedly\
          `Orchid `_\
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index cbc36d2a0..de8f18669 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -418,7 +418,7 @@ class Tokenizer:
         #   'ผิดปกติ', 'ของ', 'การ', 'พูด']
 
     Tokenizer object instantiated with a file path containing list of
-    word separated with *newline* and explicitly set a new tokeneizer
+    word separated with *newline* and explicitly set a new tokenizer
     after initiation::
 
         PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txtt'
diff --git a/setup.py b/setup.py
index 2d9c74efb..a2bb42b26 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,6 @@
 ]
 
 extras = {
-    "artagger": ["artagger>=0.1.0.3"],
     "attacut": ["attacut>=1.0.4"],
     "benchmarks": ["numpy>=1.16", "pandas>=0.24"],
     "icu": ["pyicu>=2.3"],
@@ -54,7 +53,6 @@
     "thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16"],
     "thai2rom": ["torch>=1.0.0", "numpy>=1.16"],
     "full": [
-        "artagger>=0.1.0.3",
         "attacut>=1.0.4",
         "emoji>=0.5.1",
         "epitran>=1.1",
diff --git a/tests/test_tag.py b/tests/test_tag.py
index 6a6f2dc0f..fe09839d1 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -44,14 +44,6 @@ def test_pos_tag(self):
         self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
         self.assertEqual(perceptron.tag([], corpus="orchid"), [])
 
-        self.assertIsNotNone(pos_tag(None, engine="artagger"))
-        self.assertIsNotNone(pos_tag([], engine="artagger"))
-        self.assertIsNotNone(pos_tag(tokens, engine="artagger"))
-        self.assertEqual(
-            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="artagger"),
-            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
-        )
-
         self.assertEqual(pos_tag_sents(None), [])
         self.assertEqual(pos_tag_sents([]), [])
         self.assertEqual(
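Note on PATCH 1: after this change, `pos_tag` dispatches only to the *perceptron* (default) and *unigram* engines; any other `engine` value, including a leftover `"artagger"`, now falls through to the `unigram` branch of the `if`/`else` shown above. A minimal sketch of the surviving call path, using only names that appear in this patch (the exact tags returned depend on the installed corpus data):

```python
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

# pos_tag() expects a list of tokens, so tokenize first.
words = word_tokenize("คุณกำลังประชุม")

print(pos_tag(words, engine="perceptron", corpus="orchid"))  # the defaults
print(pos_tag(words, engine="unigram", corpus="orchid"))
```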
From 4a8b55ac2b7818ff4b40f1cb708c0354ab7a542f Mon Sep 17 00:00:00 2001
From: bact
Date: Mon, 25 Nov 2019 08:16:57 +0000
Subject: [PATCH 2/6] bump attacut version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a2bb42b26..3d9b71bb4 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,7 @@
 ]
 
 extras = {
-    "attacut": ["attacut>=1.0.4"],
+    "attacut": ["attacut>=1.0.6"],
     "benchmarks": ["numpy>=1.16", "pandas>=0.24"],
     "icu": ["pyicu>=2.3"],
     "ipa": ["epitran>=1.1"],

From 0430f6b61ec81cbf649c754cedda2a940c28af13 Mon Sep 17 00:00:00 2001
From: bact
Date: Mon, 25 Nov 2019 23:20:58 +0000
Subject: [PATCH 3/6] fix typo "sounddex" -> "soundex"

---
 pythainlp/cli/soundex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/cli/soundex.py b/pythainlp/cli/soundex.py
index a983cad83..a7f427d24 100644
--- a/pythainlp/cli/soundex.py
+++ b/pythainlp/cli/soundex.py
@@ -7,7 +7,7 @@
 
 class App:
     def __init__(self, argv):
-        parser = argparse.ArgumentParser("sounddex")
+        parser = argparse.ArgumentParser("soundex")
         parser.add_argument(
             "--text",
             type=str,

From 69f231986ed4f23a0c9e40bf76ec58d9e3a97945 Mon Sep 17 00:00:00 2001
From: bact
Date: Mon, 25 Nov 2019 23:25:12 +0000
Subject: [PATCH 4/6] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9f838989a..7d87b2da6 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil
 
 **This is a document for development branch (post 2.0). Things will break.**
 
 - The latest stable release is [2.0.7](https://github.com/PyThaiNLP/pythainlp/releases)
-- The latest development release is [2.1.dev7](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
+- The latest development release is [2.1.dev8](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
 - 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page
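Note on PATCH 3: the corrected string is the first positional argument of `argparse.ArgumentParser`, which becomes the parser's `prog` name and is printed verbatim in usage and `--help` messages, so the typo was user-visible. A standard-library-only illustration (the `--text` option mirrors the one in the diff):

```python
import argparse

# The first positional argument to ArgumentParser is "prog",
# the program name shown in usage and help output.
parser = argparse.ArgumentParser("soundex")
parser.add_argument("--text", type=str)

parser.print_usage()  # prints: usage: soundex [-h] [--text TEXT]
```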
From 3921e66efaafd68079bb18ee77cb8fc64d6e5a13 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 5 Dec 2019 22:01:01 +0700
Subject: [PATCH 5/6] =?UTF-8?q?Update=20words=5Fth.txt=20:=20del=20?=
 =?UTF-8?q?=E0=B8=8A=E0=B8=B4=E0=B8=8A=E0=B8=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pythainlp/corpus/words_th.txt | 1 -
 tests/test_tokenize.py        | 7 +++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pythainlp/corpus/words_th.txt b/pythainlp/corpus/words_th.txt
index 7fc7e0131..4feccea0f 100755
--- a/pythainlp/corpus/words_th.txt
+++ b/pythainlp/corpus/words_th.txt
@@ -15594,7 +15594,6 @@
 ชิงไหวชิงพริบ
 ชิงฮื้อ
 ชิชะ
-ชิชิ
 ชิณณะ
 ชิด
 ชิดขวา
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index d47b966b3..21b9224f2 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -208,6 +208,13 @@ def test_word_tokenize_newmm(self):
         self.assertIsNotNone(
             word_tokenize(long_danger_text, engine="newmm-safe")
         )
+        text = "ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ"
+        self.assertIsNotNone(
+            word_tokenize(text, engine="newmm")
+        )
+        self.assertIsNotNone(
+            word_tokenize(text, engine="newmm-safe")
+        )
 
     def test_word_tokenize_attacut(self):
         self.assertEqual(attacut.segment(None), [])

From fbc87a5b87d481bdc3671537e19d5da542179551 Mon Sep 17 00:00:00 2001
From: bact
Date: Fri, 6 Dec 2019 16:17:24 +0000
Subject: [PATCH 6/6] Update test_tokenize.py

---
 tests/test_tokenize.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 21b9224f2..eec517826 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -196,24 +196,26 @@ def test_word_tokenize_newmm(self):
         self.assertIsNotNone(word_tokenize(long_text, engine="newmm"))
         self.assertIsNotNone(word_tokenize(long_text, engine="newmm-safe"))
 
-        short_danger_text = """
+        danger_text1 = """
+        ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ
+        """
+        danger_text2 = """
         ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้าน
         """
-        long_danger_text = """
+        danger_text3 = """
         ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก
         """
 
         self.assertIsNotNone(
-            word_tokenize(short_danger_text, engine="newmm-safe")
+            word_tokenize(danger_text1, engine="newmm")
         )
         self.assertIsNotNone(
-            word_tokenize(long_danger_text, engine="newmm-safe")
+            word_tokenize(danger_text1, engine="newmm-safe")
         )
-        text = "ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ"
         self.assertIsNotNone(
-            word_tokenize(text, engine="newmm")
+            word_tokenize(danger_text2, engine="newmm-safe")
         )
         self.assertIsNotNone(
-            word_tokenize(text, engine="newmm-safe")
+            word_tokenize(danger_text3, engine="newmm-safe")
         )
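Note on PATCH 5 and PATCH 6: the strings added here are stress inputs for the `newmm` tokenizer, long runs of a repeated syllable that can produce a combinatorial number of candidate segmentations (PATCH 5 also deletes the dictionary entry ชิชิ from `words_th.txt`). The tests assert only a non-`None` result because the exact tokenization is unspecified; what matters is that both engines terminate. A hedged reproduction sketch, with engine names as used in the tests:

```python
from pythainlp.tokenize import word_tokenize

# Repetitive text in the spirit of danger_text1: many repeats of "ชิ"
# give the dictionary matcher a huge number of possible word boundaries.
danger_text = "ชิชิ" * 30

# "newmm" is the plain dictionary-based maximal-matching engine;
# "newmm-safe" is the variant the tests exercise to guard against
# pathological running time on such input.
print(word_tokenize(danger_text, engine="newmm"))
print(word_tokenize(danger_text, engine="newmm-safe"))
```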