From 2b2ef0bdb9f71d33bf7a04d42289e4560af298a0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 15 Apr 2019 18:53:48 +0200 Subject: [PATCH 1/3] Makes custom dictionary arguments more consistent across different engine. Handles Trie, Iterable[str], and str (path to dictionary). --- pythainlp/tokenize/__init__.py | 60 +++++++++++++++++++++++---------- pythainlp/tokenize/deepcut.py | 17 +++++++--- pythainlp/tokenize/longest.py | 18 ++++++---- pythainlp/tokenize/multi_cut.py | 27 +++++++++------ pythainlp/tokenize/newmm.py | 17 +++++----- 5 files changed, 92 insertions(+), 47 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 40ae585c7..6587e9bb7 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -47,7 +47,7 @@ def word_tokenize( from .newmm import segment as segment_ def segment(text): - return segment_(text, trie=FROZEN_DICT_TRIE) + return segment_(text, custom_dict=FROZEN_DICT_TRIE) elif engine == "icu": from .pyicu import segment @@ -58,20 +58,26 @@ def segment(text): else: # default, use "newmm" engine from .newmm import segment - if not whitespaces: - return [token.strip(" ") for token in segment(text) if token.strip(" ")] + segments = segment(text) - return segment(text) + if whitespaces: + return segments + + return [token.strip(" ") for token in segments if token.strip(" ")] def dict_word_tokenize( - text: str, custom_dict: Trie, engine: str = "newmm" + text: str, + custom_dict: Union[Trie, Iterable[str], str] = DEFAULT_DICT_TRIE, + engine: str = "newmm", + whitespaces: bool = True, ) -> List[str]: """ :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. :param str text: text to be tokenized - :param dict custom_dict: a dictionary trie - :param str engine: choose between different options of engine to token (newmm, mm, longest and deepcut) + :param dict custom_dict: a dictionary trie, or an iterable of words, or a string of dictionary path + :param str engine: choose between different options of engine to token (newmm [default], mm, longest, and deepcut) + :param bool whitespaces: True to output no whitespace, a common mark of end of phrase in Thai :return: list of words **Example**:: >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie @@ -86,16 +92,32 @@ def dict_word_tokenize( if engine == "newmm" or engine == "onecut": from .newmm import segment + + custom_dict = dict_trie(custom_dict) elif engine == "longest" or engine == "longest-matching": from .longest import segment + + custom_dict = dict_trie(custom_dict) elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment + + custom_dict = dict_trie(custom_dict) elif engine == "deepcut": from .deepcut import segment - return segment(text,list(custom_dict)) + + if not isinstance(custom_dict, List) and not isinstance(custom_dict, str): + custom_dict = list(custom_dict) else: # default, use "newmm" engine from .newmm import segment - return segment(text, custom_dict) + + custom_dict = dict_trie(custom_dict) + + segments = segment(text, custom_dict) + + if whitespaces: + return segments + + return [token.strip(" ") for token in segments if token.strip(" ")] def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: @@ -135,11 +157,8 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: if engine == "etcc": from .etcc import segment - - return segment(text) - - # default is "tcc" - from .tcc import segment + else: 
# default + from .tcc import segment return segment(text) @@ -164,7 +183,7 @@ def syllable_tokenize(text: str) -> List[str]: return tokens -def dict_trie(dict_source: Union[str, Iterable]) -> Trie: +def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: """ Create a dict trie which will be used for word_tokenize() function. For more information on the trie data structure, @@ -173,20 +192,25 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie: :param string/list dict_source: a list of vocaburaries or a path to source file :return: a trie created from a dictionary input """ + trie = None if type(dict_source) is str: # Receive a file path of the dict to read with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() - return Trie(_vocabs) + trie = Trie(_vocabs) elif isinstance(dict_source, Iterable): # Received a sequence type object of vocabs - return Trie(dict_source) + trie = Trie(dict_source) + elif isinstance(dict_source, Trie): + trie = dict_source else: raise TypeError( - "Type of dict_source must be either str (path to source file) or iterable" + "Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)" ) + return trie + class Tokenizer: def __init__( diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 31636e06b..f3ec1efb4 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -3,12 +3,21 @@ Wrapper for deepcut Thai word segmentation """ -from typing import List +from typing import List, Union import deepcut +from marisa_trie import Trie + + +def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]: + if not text: + return [] + + if custom_dict: + if isinstance(custom_dict, Trie): + custom_dict = list(custom_dict) + + return deepcut.tokenize(text, custom_dict) -def segment(text: str,dict_source:List[str]=None) -> List[str]: - if dict_source!=None: - return deepcut.tokenize(text, custom_dict=dict_source) return deepcut.tokenize(text) diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 83ce495a1..db0bf889c 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -6,9 +6,12 @@ https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py """ import re +from typing import List from pythainlp.tokenize import DEFAULT_DICT_TRIE +from marisa_trie import Trie + _FRONT_DEP_CHAR = [ "ะ", "ั", @@ -36,7 +39,7 @@ class LongestMatchTokenizer(object): - def __init__(self, trie): + def __init__(self, trie: Trie): self.__trie = trie def __search_nonthai(self, text: str): @@ -130,14 +133,17 @@ def __segment_text(self, text: str): return tokens - def tokenize(self, text): + def tokenize(self, text: str) -> List[str]: tokens = self.__segment_text(text) return tokens -def segment(text, trie=None): +def segment(text: str, custom_dict: Trie = None) -> List[str]: """ตัดคำภาษาไทยด้วยวิธี longest matching""" - if not trie: - trie = DEFAULT_DICT_TRIE + if not text: + return [] + + if not custom_dict: + custom_dict = DEFAULT_DICT_TRIE - return LongestMatchTokenizer(trie).tokenize(text) + return LongestMatchTokenizer(custom_dict).tokenize(text) diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py index d161bdf4e..5d1238336 100644 --- a/pythainlp/tokenize/multi_cut.py +++ b/pythainlp/tokenize/multi_cut.py @@ -8,9 +8,12 @@ """ import re from collections import defaultdict +from typing import List from pythainlp.tokenize import DEFAULT_DICT_TRIE +from 
marisa_trie import Trie + class LatticeString(str): """ @@ -40,13 +43,14 @@ def __init__(self, value, multi=None, in_dict=True): _PAT_ENG = re.compile(_RE_ENG) -def _multicut(text, trie=None): +def _multicut(text: str, custom_dict: Trie = None): """ ส่งคืน LatticeString คืนมาเป็นก้อนๆ """ + if not custom_dict: + custom_dict = DEFAULT_DICT_TRIE + len_text = len(text) - if not trie: - trie = DEFAULT_DICT_TRIE words_at = defaultdict(list) # main data structure def serialize(p, p2): # helper function @@ -64,7 +68,7 @@ def serialize(p, p2): # helper function p = min(q) q -= {p} # q.pop, but for set - for w in trie.prefixes(text[p:]): + for w in custom_dict.prefixes(text[p:]): words_at[p].append(w) q.add(p + len(w)) @@ -80,7 +84,7 @@ def serialize(p, p2): # helper function i = p + m.span()[1] else: # skip น้อยที่สุด ที่เป็นไปได้ for i in range(p, len_text): - ww = trie.prefixes(text[i:]) + ww = custom_dict.prefixes(text[i:]) m = _PAT_ENG.match(text[i:]) if ww or m: break @@ -93,7 +97,7 @@ def serialize(p, p2): # helper function q.add(i) -def mmcut(text): +def mmcut(text: str): res = [] for w in _multicut(text): mm = min(w.multi, key=lambda x: x.count("/")) @@ -101,7 +105,7 @@ def mmcut(text): return res -def _combine(ww): +def _combine(ww: str): if ww == []: yield "" else: @@ -114,22 +118,23 @@ def _combine(ww): yield m.replace("/", "|") + "|" + tail -def segment(text, trie=None): +def segment(text: str, custom_dict: Trie = None) -> List[str]: """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ if not text: return [] - return list(_multicut(text, trie=trie)) + return list(_multicut(text, custom_dict=custom_dict)) -def find_all_segment(text, trie=None): +def find_all_segment(text: str, custom_dict: Trie = None) -> List[str]: """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ if not text: return [] - ww = list(_multicut(text, trie=trie)) + ww = list(_multicut(text, custom_dict=custom_dict)) + return list(_combine(ww)) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 066ff1017..88b766eea 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -13,6 +13,8 @@ from pythainlp.tokenize import DEFAULT_DICT_TRIE +from marisa_trie import Trie + from .tcc import tcc_pos # ช่วยตัดพวกภาษาอังกฤษ เป็นต้น @@ -39,7 +41,7 @@ def bfs_paths_graph(graph, start, goal): queue.append((next, path + [next])) -def onecut(text: str, trie): +def onecut(text: str, custom_dict: Trie): graph = defaultdict(list) # main data structure allow_pos = tcc_pos(text) # ตำแหน่งที่ตัด ต้องตรงกับ tcc @@ -48,7 +50,7 @@ def onecut(text: str, trie): while q[0] < len(text): p = heappop(q) - for w in trie.prefixes(text[p:]): + for w in custom_dict.prefixes(text[p:]): p_ = p + len(w) if p_ in allow_pos: # เลือกที่สอดคล้อง tcc graph[p].append(p_) @@ -74,7 +76,7 @@ def onecut(text: str, trie): if i in allow_pos: # ใช้ tcc ด้วย ww = [ w - for w in trie.prefixes(text[i:]) + for w in custom_dict.prefixes(text[i:]) if (i + len(w) in allow_pos) ] ww = [w for w in ww if not _PAT_TWOCHARS.match(w)] @@ -90,12 +92,11 @@ def onecut(text: str, trie): heappush(q, i) -# ช่วยให้ไม่ต้องพิมพ์ยาวๆ -def segment(text: str, trie=None) -> List[str]: +def segment(text: str, custom_dict: Trie = None) -> List[str]: if not text: return [] - if not trie: - trie = DEFAULT_DICT_TRIE + if not custom_dict: + custom_dict = DEFAULT_DICT_TRIE - return list(onecut(text, trie)) + return list(onecut(text, custom_dict)) From e84822151151f44bb689a21f5cc6c387ca6364b0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 15 Apr 2019 
19:33:57 +0200 Subject: [PATCH 2/3] More test cases for dict_word_tokenize and deepcut --- tests/__init__.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index b2f7c711f..9a33a9902 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -29,6 +29,7 @@ from pythainlp.tag.locations import tag_provinces from pythainlp.tag.named_entity import ThaiNameTagger from pythainlp.tokenize import ( + DEFAULT_DICT_TRIE, FROZEN_DICT_TRIE, Tokenizer, dict_trie, @@ -43,6 +44,7 @@ tcc, word_tokenize, ) +from pythainlp.tokenize import deepcut as tokenize_deepcut from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list @@ -305,6 +307,7 @@ def test_dict_word_tokenize(self): "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="longest", + whitespaces=False, ) ) self.assertIsNotNone( @@ -351,10 +354,15 @@ def test_word_tokenize_icu(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) - # def test_word_tokenize_deepcut(self): - # self.assertEqual(deepcut.segment(None), []) - # self.assertEqual(deepcut.segment(""), []) - # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + def test_word_tokenize_deepcut(self): + self.assertEqual(tokenize_deepcut.segment(None), []) + self.assertEqual(tokenize_deepcut.segment(""), []) + self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", DEFAULT_DICT_TRIE)) + self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", ["ทด", "สอบ"])) + self.assertIsNotNone(dict_word_tokenize("ทดสอบ", engine="deepcut")) + self.assertIsNotNone( + dict_word_tokenize("ทดสอบ", engine="deepcut", custom_dict=["ทด", "สอบ"]) + ) def test_word_tokenize_longest_matching(self): self.assertEqual(longest.segment(None), []) @@ -405,9 +413,10 @@ def test_sent_tokenize(self): self.assertEqual(sent_tokenize("รักน้ำ รักปลา "), ["รักน้ำ", "รักปลา"]) def test_subword_tokenize(self): - self.assertEqual(subword_tokenize(None), "") - self.assertEqual(subword_tokenize(""), "") - self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร")) + self.assertEqual(subword_tokenize(None), []) + self.assertEqual(subword_tokenize(""), []) + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="tcc")) + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="etcc")) def test_syllable_tokenize(self): self.assertEqual(syllable_tokenize(None), []) From a5525c374425a9062fda489911c3018211e83f77 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 15 Apr 2019 19:47:22 +0200 Subject: [PATCH 3/3] if input is empty, subword_tokenize() should return empty list. --- pythainlp/tokenize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 6587e9bb7..d8cc6bafe 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -153,7 +153,7 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: :return: a list of tokenized strings. """ if not text: - return "" + return [] if engine == "etcc": from .etcc import segment
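
The snippet below is a minimal sketch, not part of the patch series itself, of how the unified custom_dict argument could be exercised once these patches are applied. It follows the doctest style already used in the dict_word_tokenize docstring; the word list and the dictionary-file path are purely illustrative.

    >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
    >>> words = ["แมว", "ดี"]
    >>> dict_word_tokenize("แมวดีดีแมว", custom_dict=words)              # iterable of words
    >>> dict_word_tokenize("แมวดีดีแมว", custom_dict=dict_trie(words))   # pre-built trie
    >>> dict_word_tokenize("แมวดี ดีแมว", custom_dict=words, whitespaces=False)  # strip space tokens
    >>> dict_word_tokenize("แมวดีดีแมว", custom_dict="/path/to/dict.txt", engine="longest")  # hypothetical path to a word-list file

Per the changes in __init__.py, every engine except deepcut normalizes custom_dict through dict_trie(), which after this patch also accepts an already-built Trie and returns it as-is; the deepcut engine instead converts a Trie to a plain list of words (or forwards a file path) before handing it to deepcut.tokenize().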