From 797caefde80ab6513861e228a8aa1e379396d274 Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@kkumail.com>
Date: Mon, 15 Apr 2019 14:42:19 +0700
Subject: [PATCH 1/6] Add deepcut engine to dict_word_tokenize

---
 pythainlp/tokenize/__init__.py | 8 ++++++++
 pythainlp/tokenize/deepcut.py  | 4 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 94e952fac..8e3756429 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -9,6 +9,7 @@
 
 from marisa_trie import Trie
 
+DICT_LIST=thai_words()
 DEFAULT_DICT_TRIE = Trie(thai_words())
 FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt"))
 
@@ -80,6 +81,7 @@ def dict_word_tokenize(
         >>> dict_word_tokenize("แมวดีดีแมว", trie)
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
+    global DICT_LIST
 
     if not text:
         return []
@@ -90,6 +92,9 @@ def dict_word_tokenize(
         from .longest import segment
     elif engine == "mm" or engine == "multi_cut":
         from .multi_cut import segment
+    elif engine == "deepcut":
+        from .deepcut import segment
+        return segment(text,DICT_LIST)
     else:  # default, use "newmm" engine
         from .newmm import segment
 
@@ -171,14 +176,17 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie:
     :param string/list dict_source: a list of vocaburaries or a path to source file
     :return: a trie created from a dictionary input
     """
+    global DICT_LIST
 
     if type(dict_source) is str:
         # Receive a file path of the dict to read
         with open(dict_source, "r", encoding="utf8") as f:
             _vocabs = f.read().splitlines()
+            DICT_LIST=_vocabs
             return Trie(_vocabs)
     elif isinstance(dict_source, Iterable):
         # Received a sequence type object of vocabs
+        _vocabs=dict_source
         return Trie(dict_source)
     else:
         raise TypeError(
diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py
index a3844c2f3..31636e06b 100644
--- a/pythainlp/tokenize/deepcut.py
+++ b/pythainlp/tokenize/deepcut.py
@@ -8,5 +8,7 @@
 import deepcut
 
 
-def segment(text: str) -> List[str]:
+def segment(text: str, dict_source: List[str] = None) -> List[str]:
+    if dict_source is not None:  # pass custom_dict to deepcut only when a word list was supplied
+        return deepcut.tokenize(text, custom_dict=dict_source)
     return deepcut.tokenize(text)

From 6e38de0dab1b6112ee7d355a41e69226ce2aade7 Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@kkumail.com>
Date: Mon, 15 Apr 2019 14:57:34 +0700
Subject: [PATCH 2/6] update dict trie from Trie to (Trie,List)

---
 pythainlp/tokenize/__init__.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 8e3756429..e17579c5a 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -81,7 +81,6 @@ def dict_word_tokenize(
         >>> dict_word_tokenize("แมวดีดีแมว", trie)
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
-    global DICT_LIST
 
     if not text:
         return []
@@ -94,11 +93,11 @@ def dict_word_tokenize(
         from .multi_cut import segment
     elif engine == "deepcut":
         from .deepcut import segment
-        return segment(text,DICT_LIST)
+        return segment(text,custom_dict[1])
     else:  # default, use "newmm" engine
         from .newmm import segment
 
-    return segment(text, custom_dict)
+    return segment(text, custom_dict[0])
 
 
 def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:
@@ -176,18 +175,15 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie:
     :param string/list dict_source: a list of vocaburaries or a path to source file
     :return: a trie created from a dictionary input
     """
-    global DICT_LIST
 
     if type(dict_source) is str:
         # Receive a file path of the dict to read
         with open(dict_source, "r", encoding="utf8") as f:
             _vocabs = f.read().splitlines()
-            DICT_LIST=_vocabs
-            return Trie(_vocabs)
+            return (Trie(_vocabs),_vocabs)
     elif isinstance(dict_source, Iterable):
         # Received a sequence type object of vocabs
-        _vocabs=dict_source
-        return Trie(dict_source)
+        return (Trie(dict_source),dict_source)
     else:
         raise TypeError(
             "Type of dict_source must be either str (path to source file) or iterable"

From 1a3338b74acf8276ede5753d4d55a1d354d76252 Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@kkumail.com>
Date: Mon, 15 Apr 2019 15:09:11 +0700
Subject: [PATCH 3/6] fix bug test

---
 pythainlp/tokenize/__init__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index e17579c5a..f32f48f54 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -9,7 +9,6 @@
 
 from marisa_trie import Trie
 
-DICT_LIST=thai_words()
 DEFAULT_DICT_TRIE = Trie(thai_words())
 FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt"))
 
@@ -96,8 +95,10 @@ def dict_word_tokenize(
         return segment(text,custom_dict[1])
     else:  # default, use "newmm" engine
         from .newmm import segment
-
-    return segment(text, custom_dict[0])
+    if type(custom_dict) is tuple:
+        return segment(text, custom_dict[0])
+    else:
+        return segment(text, custom_dict)
 
 
 def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:

From 0b1817f4d00a8de5ebadfc226e075fddf55e7cea Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@kkumail.com>
Date: Mon, 15 Apr 2019 15:21:25 +0700
Subject: [PATCH 4/6] Trie to list for deepcut

---
 pythainlp/tokenize/__init__.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index f32f48f54..82566f5c9 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -92,13 +92,10 @@ def dict_word_tokenize(
         from .multi_cut import segment
     elif engine == "deepcut":
         from .deepcut import segment
-        return segment(text,custom_dict[1])
+        return segment(text,list(custom_dict))
     else:  # default, use "newmm" engine
         from .newmm import segment
-    if type(custom_dict) is tuple:
-        return segment(text, custom_dict[0])
-    else:
-        return segment(text, custom_dict)
+    return segment(text, custom_dict)
 
 
 def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:
@@ -181,10 +178,10 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie:
         # Receive a file path of the dict to read
         with open(dict_source, "r", encoding="utf8") as f:
             _vocabs = f.read().splitlines()
-            return (Trie(_vocabs),_vocabs)
+            return Trie(_vocabs)
     elif isinstance(dict_source, Iterable):
         # Received a sequence type object of vocabs
-        return (Trie(dict_source),dict_source)
+        return Trie(dict_source)
     else:
         raise TypeError(
             "Type of dict_source must be either str (path to source file) or iterable"

From 1b5109e3d7079ef91a1935d68830c40ef8791044 Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@kkumail.com>
Date: Mon, 15 Apr 2019 15:24:58 +0700
Subject: [PATCH 5/6] update docs

---
 pythainlp/tokenize/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 82566f5c9..40ae585c7 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -71,7 +71,7 @@ def dict_word_tokenize(
     :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.
     :param str text: text to be tokenized
     :param dict custom_dict: a dictionary trie
-    :param str engine: choose between different options of engine to token (newmm, longest)
+    :param str engine: tokenization engine to use (newmm, mm, longest, or deepcut)
     :return: list of words
     **Example**::
         >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie

From 3ab9dc40894fb0c6d867050b8f3076466617053a Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@kkumail.com>
Date: Mon, 15 Apr 2019 15:40:46 +0700
Subject: [PATCH 6/6] =?UTF-8?q?del=20=E0=B9=92,=E0=B9=95=E0=B9=94=E0=B9=90?=
 =?UTF-8?q?=20=E0=B8=A3=E0=B8=B2=E0=B8=A2=E0=B8=81=E0=B8=B2=E0=B8=A3=20fro?=
 =?UTF-8?q?m=20=20words=5Fth.txt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pythainlp/corpus/words_th.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pythainlp/corpus/words_th.txt b/pythainlp/corpus/words_th.txt
index 4e6259387..0fa96af67 100755
--- a/pythainlp/corpus/words_th.txt
+++ b/pythainlp/corpus/words_th.txt
@@ -61186,7 +61186,6 @@
 แอกน้อย
 แอด ๆ
 แอบ ๆ
-๒,๕๔๐ รายการ
 โอ้กอ้าก
 โอฆ
 โอฆชล