62 changes: 43 additions & 19 deletions pythainlp/tokenize/__init__.py
@@ -47,7 +47,7 @@ def word_tokenize(
from .newmm import segment as segment_

def segment(text):
return segment_(text, trie=FROZEN_DICT_TRIE)
return segment_(text, custom_dict=FROZEN_DICT_TRIE)

elif engine == "icu":
from .pyicu import segment
@@ -58,20 +58,26 @@ def segment(text):
else: # default, use "newmm" engine
from .newmm import segment

if not whitespaces:
return [token.strip(" ") for token in segment(text) if token.strip(" ")]
segments = segment(text)

return segment(text)
if whitespaces:
return segments

return [token.strip(" ") for token in segments if token.strip(" ")]
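
For reference, a minimal usage sketch of the whitespace handling above, assuming the package-level import pythainlp.tokenize.word_tokenize and the default "newmm" engine; the exact tokens depend on the bundled dictionary.

from pythainlp.tokenize import word_tokenize

text = "ผมรักภาษาไทย มาก"  # contains a space

# whitespaces=True (default): whitespace tokens are kept in the output
print(word_tokenize(text, engine="newmm"))

# whitespaces=False: space-only tokens are stripped from the result
print(word_tokenize(text, engine="newmm", whitespaces=False))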


def dict_word_tokenize(
text: str, custom_dict: Trie, engine: str = "newmm"
text: str,
custom_dict: Union[Trie, Iterable[str], str] = DEFAULT_DICT_TRIE,
engine: str = "newmm",
whitespaces: bool = True,
) -> List[str]:
"""
:meth:`dict_word_tokenize` tokenizes words based on the dictionary you provide: a trie, an iterable of words, or a path to a dictionary file.
:param str text: text to be tokenized
:param dict custom_dict: a dictionary trie
:param str engine: choose between different options of engine to token (newmm, mm, longest and deepcut)
:param dict custom_dict: a dictionary trie, an iterable of words, or a path to a dictionary file
:param str engine: tokenization engine (newmm [default], mm, longest, or deepcut)
:param bool whitespaces: True (default) to keep whitespace tokens in the output; whitespace is a common mark of the end of a phrase in Thai
:return: list of words
**Example**::
>>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
@@ -86,16 +92,32 @@ def dict_word_tokenize(

if engine == "newmm" or engine == "onecut":
from .newmm import segment

custom_dict = dict_trie(custom_dict)
elif engine == "longest" or engine == "longest-matching":
from .longest import segment

custom_dict = dict_trie(custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment

custom_dict = dict_trie(custom_dict)
elif engine == "deepcut":
from .deepcut import segment
return segment(text,list(custom_dict))

if not isinstance(custom_dict, (list, str)):
custom_dict = list(custom_dict)
else: # default, use "newmm" engine
from .newmm import segment
return segment(text, custom_dict)

custom_dict = dict_trie(custom_dict)

segments = segment(text, custom_dict)

if whitespaces:
return segments

return [token.strip(" ") for token in segments if token.strip(" ")]
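
For reference, a hedged sketch of the widened custom_dict parameter above: it should accept a pre-built trie, a plain iterable of words, or a path to a word-list file. The word list and the file path below are illustrative placeholders, not part of this changeset.

from pythainlp.tokenize import dict_trie, dict_word_tokenize

words = ["แมว", "กิน", "ปลา"]  # a small custom vocabulary

# 1) an iterable of words is converted to a trie internally
print(dict_word_tokenize("แมวกินปลา", custom_dict=words))

# 2) a pre-built trie can be reused across calls
trie = dict_trie(words)
print(dict_word_tokenize("แมวกินปลา", custom_dict=trie, engine="newmm"))

# 3) a path to a newline-delimited word list (hypothetical path)
# print(dict_word_tokenize("แมวกินปลา", custom_dict="/path/to/wordlist.txt"))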


def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:
@@ -131,15 +153,12 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]:
:return: a list of tokenized strings.
"""
if not text:
return ""
return []

if engine == "etcc":
from .etcc import segment

return segment(text)

# default is "tcc"
from .tcc import segment
else: # default
from .tcc import segment

return segment(text)
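
For reference, a small sketch of the subword behaviour above; note that empty input now yields an empty list rather than an empty string. "tcc" is the default engine, "etcc" the alternative.

from pythainlp.tokenize import subword_tokenize

print(subword_tokenize("ประเทศไทย"))                 # TCC clusters (default engine)
print(subword_tokenize("ประเทศไทย", engine="etcc"))  # enhanced TCC clusters
print(subword_tokenize(""))                          # [] rather than an empty string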

@@ -164,7 +183,7 @@ def syllable_tokenize(text: str) -> List[str]:
return tokens


def dict_trie(dict_source: Union[str, Iterable]) -> Trie:
def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
"""
Create a dict trie which will be used for word_tokenize() function.
For more information on the trie data structure,
@@ -173,20 +192,25 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie:
:param string/list dict_source: a list of words, a path to a dictionary file, or an existing Trie
:return: a trie created from a dictionary input
"""
trie = None

if type(dict_source) is str:
# Receive a file path of the dict to read
with open(dict_source, "r", encoding="utf8") as f:
_vocabs = f.read().splitlines()
return Trie(_vocabs)
trie = Trie(_vocabs)
elif isinstance(dict_source, Iterable):
# Received an iterable of words
return Trie(dict_source)
trie = Trie(dict_source)
elif isinstance(dict_source, Trie):
trie = dict_source
else:
raise TypeError(
"Type of dict_source must be either str (path to source file) or iterable"
"Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)"
)

return trie
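
For reference, a brief sketch of the three dict_source types accepted above; the file path is a hypothetical placeholder.

from pythainlp.tokenize import dict_trie

# From an iterable of words
trie = dict_trie(["ไก่", "ไข่", "ขวด"])

# From an existing marisa_trie.Trie
trie_again = dict_trie(trie)

# From a path to a UTF-8 file with one word per line (hypothetical path)
# trie_from_file = dict_trie("/path/to/words.txt")

# Any other type raises TypeError
try:
    dict_trie(42)
except TypeError as err:
    print(err)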


class Tokenizer:
def __init__(
17 changes: 13 additions & 4 deletions pythainlp/tokenize/deepcut.py
@@ -3,12 +3,21 @@
Wrapper for deepcut Thai word segmentation
"""

from typing import List
from typing import List, Union

import deepcut

from marisa_trie import Trie


def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:
if not text:
return []

if custom_dict:
if isinstance(custom_dict, Trie):
custom_dict = list(custom_dict)

return deepcut.tokenize(text, custom_dict)

def segment(text: str,dict_source:List[str]=None) -> List[str]:
if dict_source!=None:
return deepcut.tokenize(text, custom_dict=dict_source)
return deepcut.tokenize(text)
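
For reference, a hedged sketch of the rewritten wrapper above; it assumes the deepcut package is installed and uses the module path pythainlp.tokenize.deepcut from this diff. A trie-typed custom_dict is flattened to a word list before being passed to deepcut.tokenize.

from marisa_trie import Trie

from pythainlp.tokenize.deepcut import segment

print(segment("ผมรักคุณ"))                          # plain deepcut tokenization
print(segment("ผมรักคุณ", ["ผมรัก", "คุณ"]))         # with a custom word list
print(segment("ผมรักคุณ", Trie(["ผมรัก", "คุณ"])))   # a trie is converted to a list first
print(segment(""))                                  # [] for empty input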
18 changes: 12 additions & 6 deletions pythainlp/tokenize/longest.py
@@ -6,9 +6,12 @@
https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
"""
import re
from typing import List

from pythainlp.tokenize import DEFAULT_DICT_TRIE

from marisa_trie import Trie

_FRONT_DEP_CHAR = [
"ะ",
"ั",
@@ -36,7 +39,7 @@


class LongestMatchTokenizer(object):
def __init__(self, trie):
def __init__(self, trie: Trie):
self.__trie = trie

def __search_nonthai(self, text: str):
@@ -130,14 +133,17 @@ def __segment_text(self, text: str):

return tokens

def tokenize(self, text):
def tokenize(self, text: str) -> List[str]:
tokens = self.__segment_text(text)
return tokens


def segment(text, trie=None):
def segment(text: str, custom_dict: Trie = None) -> List[str]:
"""ตัดคำภาษาไทยด้วยวิธี longest matching"""
if not trie:
trie = DEFAULT_DICT_TRIE
if not text:
return []

if not custom_dict:
custom_dict = DEFAULT_DICT_TRIE

return LongestMatchTokenizer(trie).tokenize(text)
return LongestMatchTokenizer(custom_dict).tokenize(text)
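
For reference, a short sketch of the renamed parameter above; with custom_dict omitted the bundled DEFAULT_DICT_TRIE is used, while a caller-supplied trie restricts matching to that vocabulary. The module path is taken from this diff and the expected output is indicative only.

from pythainlp.tokenize import dict_trie
from pythainlp.tokenize.longest import segment

print(segment("ตัดคำภาษาไทย"))  # longest matching against the default dictionary

custom = dict_trie(["ตัดคำ", "ภาษาไทย"])
print(segment("ตัดคำภาษาไทย", custom_dict=custom))  # e.g. ['ตัดคำ', 'ภาษาไทย']

print(segment(""))  # [] for empty input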
27 changes: 16 additions & 11 deletions pythainlp/tokenize/multi_cut.py
@@ -8,9 +8,12 @@
"""
import re
from collections import defaultdict
from typing import List

from pythainlp.tokenize import DEFAULT_DICT_TRIE

from marisa_trie import Trie


class LatticeString(str):
"""
@@ -40,13 +43,14 @@ def __init__(self, value, multi=None, in_dict=True):
_PAT_ENG = re.compile(_RE_ENG)


def _multicut(text, trie=None):
def _multicut(text: str, custom_dict: Trie = None):
"""
Yield LatticeString objects, chunk by chunk
"""
if not custom_dict:
custom_dict = DEFAULT_DICT_TRIE

len_text = len(text)
if not trie:
trie = DEFAULT_DICT_TRIE
words_at = defaultdict(list) # main data structure

def serialize(p, p2): # helper function
@@ -64,7 +68,7 @@ def serialize(p, p2): # helper function
p = min(q)
q -= {p} # q.pop, but for set

for w in trie.prefixes(text[p:]):
for w in custom_dict.prefixes(text[p:]):
words_at[p].append(w)
q.add(p + len(w))

@@ -80,7 +84,7 @@ def serialize(p, p2): # helper function
i = p + m.span()[1]
else: # skip as little as possible
for i in range(p, len_text):
ww = trie.prefixes(text[i:])
ww = custom_dict.prefixes(text[i:])
m = _PAT_ENG.match(text[i:])
if ww or m:
break
@@ -93,15 +97,15 @@ def serialize(p, p2): # helper function
q.add(i)


def mmcut(text):
def mmcut(text: str):
res = []
for w in _multicut(text):
mm = min(w.multi, key=lambda x: x.count("/"))
res.extend(mm.split("/"))
return res


def _combine(ww):
def _combine(ww: str):
if ww == []:
yield ""
else:
@@ -114,22 +118,23 @@ def _combine(ww):
yield m.replace("/", "|") + "|" + tail


def segment(text, trie=None):
def segment(text: str, custom_dict: Trie = None) -> List[str]:
"""
Tokenize the text, returning a list of LatticeString chunks
"""
if not text:
return []

return list(_multicut(text, trie=trie))
return list(_multicut(text, custom_dict=custom_dict))


def find_all_segment(text, trie=None):
def find_all_segment(text: str, custom_dict: Trie = None) -> List[str]:
"""
Used to find the list of all possible word segmentations
"""
if not text:
return []

ww = list(_multicut(text, trie=trie))
ww = list(_multicut(text, custom_dict=custom_dict))

return list(_combine(ww))
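
For reference, a hedged sketch of the multi_cut entry points touched above: segment returns LatticeString chunks, find_all_segment enumerates every segmentation the dictionary allows, and mmcut keeps the candidate with the fewest words (maximal matching). The module path is taken from this diff.

from pythainlp.tokenize.multi_cut import find_all_segment, mmcut, segment

text = "น้ำตกสวยมาก"

print(segment(text))           # LatticeString chunks, default dictionary
print(mmcut(text))             # maximal-matching tokens (fewest words)
print(find_all_segment(text))  # all segmentations, tokens separated by "|"
print(segment(""))             # [] for empty input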
17 changes: 9 additions & 8 deletions pythainlp/tokenize/newmm.py
@@ -13,6 +13,8 @@

from pythainlp.tokenize import DEFAULT_DICT_TRIE

from marisa_trie import Trie

from .tcc import tcc_pos

# Helps segment runs of English text and the like
@@ -39,7 +41,7 @@ def bfs_paths_graph(graph, start, goal):
queue.append((next, path + [next]))


def onecut(text: str, trie):
def onecut(text: str, custom_dict: Trie):
graph = defaultdict(list) # main data structure
allow_pos = tcc_pos(text) # cut positions must align with TCC boundaries

@@ -48,7 +50,7 @@ def onecut(text: str, trie):
while q[0] < len(text):
p = heappop(q)

for w in trie.prefixes(text[p:]):
for w in custom_dict.prefixes(text[p:]):
p_ = p + len(w)
if p_ in allow_pos: # keep only positions consistent with TCC
graph[p].append(p_)
@@ -74,7 +76,7 @@ def onecut(text: str, trie):
if i in allow_pos: # also respect TCC
ww = [
w
for w in trie.prefixes(text[i:])
for w in custom_dict.prefixes(text[i:])
if (i + len(w) in allow_pos)
]
ww = [w for w in ww if not _PAT_TWOCHARS.match(w)]
@@ -90,12 +92,11 @@ def onecut(text: str, trie):
heappush(q, i)


# Convenience wrapper to save typing the long form
def segment(text: str, trie=None) -> List[str]:
def segment(text: str, custom_dict: Trie = None) -> List[str]:
if not text:
return []

if not trie:
trie = DEFAULT_DICT_TRIE
if not custom_dict:
custom_dict = DEFAULT_DICT_TRIE

return list(onecut(text, trie))
return list(onecut(text, custom_dict))
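
For reference, a brief sketch of the renamed parameter in newmm above; the default dictionary is used when custom_dict is omitted, and a caller-built trie limits matching to that vocabulary while TCC boundaries are still respected. The module path is taken from this diff and the expected output is indicative only.

from pythainlp.tokenize import dict_trie
from pythainlp.tokenize.newmm import segment

print(segment("โรงเรียนของเรา"))  # dictionary-based maximal matching constrained by TCC

custom = dict_trie(["โรงเรียน", "ของ", "เรา"])
print(segment("โรงเรียนของเรา", custom_dict=custom))  # e.g. ['โรงเรียน', 'ของ', 'เรา']

print(segment(""))  # [] for empty input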