From 87c3b2c5dc842d36655a1607b61f6106fa7c19d9 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Thu, 1 Mar 2018 15:38:01 +0700
Subject: [PATCH 01/17] Improved custom dict tokenization

---
 pythainlp/__init__.py          |   2 +-
 pythainlp/tokenize/__init__.py | 173 +++++++++++++++++++--------------
 pythainlp/tokenize/newmm.py    |  14 +--
 3 files changed, 110 insertions(+), 79 deletions(-)

diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
index b8c46b251..c50ee9d27 100644
--- a/pythainlp/__init__.py
+++ b/pythainlp/__init__.py
@@ -1,6 +1,6 @@
 ﻿# -*- coding: utf-8 -*-
 from __future__ import absolute_import
-__version__ = 1.5
+__version__ = 1.6
 import six
 if six.PY3:
 	"""
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 6939b8e3a..44f054119 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -5,6 +5,34 @@
 import codecs
 from six.moves import zip
 from pythainlp.corpus.thaisyllable import get_data
+from pythainlp.corpus.thaiword import get_data as get_dict
+from marisa_trie import Trie
+
+class Tokenizer:
+	def __init__(self, custom_dict=None):
+		"""
+		Initialize tokenizer object
+		
+		Keyword arguments:
+		custom_dict -- a file path or a list of vocaburaies to be used to create a trie (default - original lexitron)
+
+		Object variables:
+		trie_dict -- a trie to use in tokenizing engines
+		"""
+		if custom_dict:
+			if type(custom_dict) is list:
+				self.trie_dict = Trie(custom_dict)
+			elif type(custom_dict) is str:
+				with codecs.open(custom_dict, 'r',encoding='utf8') as f:
+					vocabs = [word.rstrip() for word in f.readlines()]
+				self.trie_dict = Trie(vocabs)
+		else:
+			self.trie_dict = Trie(get_dict())
+	
+	def word_tokenize(self, text, engine='newmm'):
+		from .newmm import mmcut as segment
+		return segment(text, data=self.trie_dict)
+
 def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	'''
 	dict_word_tokenize(text,file,engine)
@@ -13,9 +41,9 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	file คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
 	engine คือ เครื่องมือตัดคำ
 	- newmm ตัดคำด้วย newmm
-    - wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+	- wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	- mm ตัดคำด้วย mm
-    - longest-matching ตัดคำโดยใช้ longest matching
+	- longest-matching ตัดคำโดยใช้ longest matching
 	data_type คือ ชนิดข้อมูล
 	- file คือ ไฟล์ข้อมูล
 	- list คือ ข้อมูลที่อยู่ใน list
@@ -24,7 +52,6 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	if data_type=='file':
 		with codecs.open(file, 'r',encoding='utf8') as f:
 			lines = f.read().splitlines()
-		f.close()
 	elif data_type=='list':
 		lines = data
 	if engine=="newmm":
@@ -36,6 +63,7 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	elif engine=='wordcutpy':
 		from .wordcutpy import segment
 	return segment(text,data=lines)
+
 def word_tokenize(text,engine='newmm'):
 	"""
 	ระบบตัดคำภาษาไทย
@@ -53,53 +81,54 @@ def word_tokenize(text,engine='newmm'):
 	- wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	"""
 	if engine=='icu':
-    		'''
-			ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ
-    		คำเตือน !!! \n คำสั่ง word_tokenize(text) ใน PyThaiNLP 1.6
-			ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm'''
-    		from .pyicu import segment
+		'''
+		ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ
+		คำเตือน !!! \n คำสั่ง word_tokenize(text) ใน PyThaiNLP 1.6
+		ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm'''
+		from .pyicu import segment
 	elif engine=='dict':
-    		'''
-			ใช้ dicu ในการตัดคำไทย
-			จะคืนค่า False หากไม่สามารถตัดคำไทย
-			'''
-    		from .dictsegment import segment
+		'''
+		ใช้ dicu ในการตัดคำไทย
+		จะคืนค่า False หากไม่สามารถตัดคำไทย
+		'''
+		from .dictsegment import segment
 	elif engine=='mm':
-    		'''
-			ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
-			'''
-    		from .mm import segment
+		'''
+		ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
+		'''
+		from .mm import segment
 	elif engine=='newmm':
-    		'''
-			ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
-			'''
-    		from .newmm import mmcut as segment
+		'''
+		ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
+		'''
+		from .newmm import mmcut as segment
 	elif engine=='longest-matching':
-    		'''
-			ใช้ Longest matching ในการตัดคำ
-			'''
-    		from .longest import segment
+		'''
+		ใช้ Longest matching ในการตัดคำ
+		'''
+		from .longest import segment
 	elif engine=='pylexto':
-    		'''
-			ใช้ LexTo ในการตัดคำ
-			'''
-    		from .pylexto import segment
+		'''
+		ใช้ LexTo ในการตัดคำ
+		'''
+		from .pylexto import segment
 	elif engine=='deepcut':
-    		'''
-			ใช้ Deep Neural Network ในการตัดคำภาษาไทย
-			'''
-    		from .deepcut import segment
+		'''
+		ใช้ Deep Neural Network ในการตัดคำภาษาไทย
+		'''
+		from .deepcut import segment
 	elif engine=='cutkum':
-    		'''
-			ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum)
-			'''
-    		from .cutkum import segment
+		'''
+		ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum)
+		'''
+		from .cutkum import segment
 	elif engine=='wordcutpy':
-    		'''
-			wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
-			'''
-    		from .wordcutpy import segment
+		'''
+		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+		'''
+		from .wordcutpy import segment
 	return segment(text)
+
 def sent_tokenize(text,engine='whitespace+newline'):
 	'''
 	sent_tokenize(text,engine='whitespace+newline')
@@ -119,37 +148,37 @@ def wordpunct_tokenize(text):
 def WhitespaceTokenizer(text):
 	return nltk.tokenize.WhitespaceTokenizer().tokenize(text)
 def isthai(text,check_all=False):
-    """
-    สำหรับเช็คว่าเป็นตัวอักษรภาษาไทยหรือไม่
-    isthai(text,check_all=False)
-    text คือ ข้อความหรือ list ตัวอักษร
-    check_all สำหรับส่งคืนค่า True หรือ False เช็คทุกตัวอักษร
+	"""
+	สำหรับเช็คว่าเป็นตัวอักษรภาษาไทยหรือไม่
+	isthai(text,check_all=False)
+	text คือ ข้อความหรือ list ตัวอักษร
+	check_all สำหรับส่งคืนค่า True หรือ False เช็คทุกตัวอักษร
 
-    การส่งคืนค่า
-    {'thai':% อักษรภาษาไทย,'check_all':tuple โดยจะเป็น (ตัวอักษร,True หรือ False)}
-    """
-    listext=list(text)
-    i=0
-    num_isthai=0
-    if check_all==True:
-        listthai=[]
-    while i<len(listext):
-        cVal = ord(listext[i])
-        if(cVal >= 3584 and cVal <= 3711):
-            num_isthai+=1
-            if check_all==True:
-                listthai.append(True)
-        else:
-            if check_all==True:
-                listthai.append(False)
-        i+=1
-    thai=(num_isthai/len(listext))*100
-    if check_all==True:
-        dictthai=tuple(zip(listext,listthai))
-        data= {'thai':thai,'check_all':dictthai}
-    else:
-        data= {'thai':thai}
-    return data
+	การส่งคืนค่า
+	{'thai':% อักษรภาษาไทย,'check_all':tuple โดยจะเป็น (ตัวอักษร,True หรือ False)}
+	"""
+	listext=list(text)
+	i=0
+	num_isthai=0
+	if check_all==True:
+		listthai=[]
+	while i<len(listext):
+		cVal = ord(listext[i])
+		if(cVal >= 3584 and cVal <= 3711):
+			num_isthai+=1
+			if check_all==True:
+				listthai.append(True)
+		else:
+			if check_all==True:
+				listthai.append(False)
+		i+=1
+	thai=(num_isthai/len(listext))*100
+	if check_all==True:
+		dictthai=tuple(zip(listext,listthai))
+		data= {'thai':thai,'check_all':dictthai}
+	else:
+		data= {'thai':thai}
+	return data
 def syllable_tokenize(text1):
 	"""
 	syllable_tokenize(text)
@@ -166,4 +195,4 @@ def syllable_tokenize(text1):
 			i+=1
 	else:
 		data=dict_word_tokenize(text=text1,data=get_data(),data_type="list")
-	return data
+	return data
\ No newline at end of file
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 64f908a85..1f9b6df39 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -83,10 +83,12 @@ def bfs_paths_graph(graph, start, goal):
       else:
         queue.append((next, path+[next]))
 
-def onecut(text, data=['']):
-  if(data != ['']):
-      trie = Trie(data)
-  else:
+def onecut(text, trie=None):
+#   if(data != ['']):
+#       trie = Trie(data)
+#   else:
+#       trie = THAI_WORDS
+  if not trie:
       trie = THAI_WORDS
   graph = defaultdict(list)  # main data structure
   allow_pos = tcc_pos(text)     # ตำแหน่งที่ตัด ต้องตรงกับ tcc
@@ -135,5 +137,5 @@ def onecut(text, data=['']):
 # ช่วยให้ไม่ต้องพิมพ์ยาวๆ
 
 
-def mmcut(text, data=['']):
-    return list(onecut(text, data=data))
+def mmcut(text, data=None):
+    return list(onecut(text, trie=data))
\ No newline at end of file

From 6c8fa6721f7ca2eab538c4daa9e48d16ee235705 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Thu, 1 Mar 2018 16:01:55 +0700
Subject: [PATCH 02/17] made dict_word_tokenize() work with the newly edited
 newmm.py

---
 pythainlp/tokenize/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 44f054119..88a22d493 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -56,6 +56,8 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 		lines = data
 	if engine=="newmm":
 		from .newmm import mmcut as segment
+		trie = Trie(lines)
+		return segment(text, data=trie)
 	elif engine=="mm":
 		from .mm import segment
 	elif engine=='longest-matching':

From 504a0f3d1d49b691d4ad5028b3985478dd9345ae Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Sat, 3 Mar 2018 14:59:40 +0700
Subject: [PATCH 03/17] declared global custom_dict_trie and vocabs to be used
 with dict_word_tokenize()

---
 pythainlp/tokenize/__init__.py | 75 +++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 34 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 88a22d493..d7ac3bbfd 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -8,32 +8,10 @@
 from pythainlp.corpus.thaiword import get_data as get_dict
 from marisa_trie import Trie
 
-class Tokenizer:
-	def __init__(self, custom_dict=None):
-		"""
-		Initialize tokenizer object
-		
-		Keyword arguments:
-		custom_dict -- a file path or a list of vocaburaies to be used to create a trie (default - original lexitron)
+VOCABS = list()
+CUSTOM_DICT_TRIE = None
 
-		Object variables:
-		trie_dict -- a trie to use in tokenizing engines
-		"""
-		if custom_dict:
-			if type(custom_dict) is list:
-				self.trie_dict = Trie(custom_dict)
-			elif type(custom_dict) is str:
-				with codecs.open(custom_dict, 'r',encoding='utf8') as f:
-					vocabs = [word.rstrip() for word in f.readlines()]
-				self.trie_dict = Trie(vocabs)
-		else:
-			self.trie_dict = Trie(get_dict())
-	
-	def word_tokenize(self, text, engine='newmm'):
-		from .newmm import mmcut as segment
-		return segment(text, data=self.trie_dict)
-
-def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
+def dict_word_tokenize(text, file='', engine="newmm", data=[''], data_type="file"):
 	'''
 	dict_word_tokenize(text,file,engine)
 	เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด
@@ -49,22 +27,26 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	- list คือ ข้อมูลที่อยู่ใน list
 	กรณีที่ใช้ list ต้องใช้ data=list(ข้อมูล)
 	'''
-	if data_type=='file':
-		with codecs.open(file, 'r',encoding='utf8') as f:
-			lines = f.read().splitlines()
-	elif data_type=='list':
-		lines = data
+	from pythainlp.tokenize import VOCABS, CUSTOM_DICT_TRIE
+	global VOCABS, CUSTOM_DICT_TRIE # Unable to replace value if 'global' is not declared
+	if not VOCABS:
+		if data_type=='file':
+			with codecs.open(file, 'r',encoding='utf8') as f:
+				VOCABS = f.read().splitlines()
+				CUSTOM_DICT_TRIE = Trie(VOCABS)
+		elif data_type=='list':
+			VOCABS = data
+			CUSTOM_DICT_TRIE = Trie(VOCABS)
 	if engine=="newmm":
 		from .newmm import mmcut as segment
-		trie = Trie(lines)
-		return segment(text, data=trie)
+		return segment(text, data=CUSTOM_DICT_TRIE)
 	elif engine=="mm":
 		from .mm import segment
 	elif engine=='longest-matching':
 		from .longest import segment
 	elif engine=='wordcutpy':
 		from .wordcutpy import segment
-	return segment(text,data=lines)
+	return segment(text, data=VOCABS)
 
 def word_tokenize(text,engine='newmm'):
 	"""
@@ -197,4 +179,29 @@ def syllable_tokenize(text1):
 			i+=1
 	else:
 		data=dict_word_tokenize(text=text1,data=get_data(),data_type="list")
-	return data
\ No newline at end of file
+	return data
+
+class Tokenizer:
+	def __init__(self, custom_dict=None):
+		"""
+		Initialize tokenizer object
+		
+		Keyword arguments:
+		custom_dict -- a file path or a list of vocaburaies to be used to create a trie (default - original lexitron)
+
+		Object variables:
+		trie_dict -- a trie to use in tokenizing engines
+		"""
+		if custom_dict:
+			if type(custom_dict) is list:
+				self.trie_dict = Trie(custom_dict)
+			elif type(custom_dict) is str:
+				with codecs.open(custom_dict, 'r',encoding='utf8') as f:
+					vocabs = f.read().splitlines()
+				self.trie_dict = Trie(vocabs)
+		else:
+			self.trie_dict = Trie(get_dict())
+	
+	def word_tokenize(self, text, engine='newmm'):
+		from .newmm import mmcut as segment
+		return segment(text, data=self.trie_dict)
\ No newline at end of file

From ebd26a94dad2fe5f0b2e0902ce47fadb46d6dd02 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Tue, 6 Mar 2018 22:56:10 +0700
Subject: [PATCH 04/17] temp

---
 pythainlp/tokenize/__init__.py | 33 ++++++++++++++++++---------------
 pythainlp/tokenize/newmm.py    | 24 ++++++++----------------
 2 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index d7ac3bbfd..30a8f7f89 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -4,14 +4,15 @@
 import re
 import codecs
 from six.moves import zip
+# from pythainlp.tools import file_trie
 from pythainlp.corpus.thaisyllable import get_data
 from pythainlp.corpus.thaiword import get_data as get_dict
 from marisa_trie import Trie
 
-VOCABS = list()
-CUSTOM_DICT_TRIE = None
+CUSTOM_DICT_INITIALIZED = False
+DICT_TRIE = None #file_trie(data="old")
 
-def dict_word_tokenize(text, file='', engine="newmm", data=[''], data_type="file"):
+def dict_word_tokenize(text, engine="newmm", custom_dict_source=None):
 	'''
 	dict_word_tokenize(text,file,engine)
 	เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด
@@ -27,26 +28,28 @@ def dict_word_tokenize(text, file='', engine="newmm", data=[''], data_type="file
 	- list คือ ข้อมูลที่อยู่ใน list
 	กรณีที่ใช้ list ต้องใช้ data=list(ข้อมูล)
 	'''
-	from pythainlp.tokenize import VOCABS, CUSTOM_DICT_TRIE
-	global VOCABS, CUSTOM_DICT_TRIE # Unable to replace value if 'global' is not declared
-	if not VOCABS:
-		if data_type=='file':
-			with codecs.open(file, 'r',encoding='utf8') as f:
-				VOCABS = f.read().splitlines()
-				CUSTOM_DICT_TRIE = Trie(VOCABS)
-		elif data_type=='list':
-			VOCABS = data
-			CUSTOM_DICT_TRIE = Trie(VOCABS)
+	from pythainlp.tokenize import CUSTOM_DICT_INITIALIZED, DICT_TRIE
+	global CUSTOM_DICT_INITIALIZED, DICT_TRIE # Unable to replace value if 'global' is not declared
+	if custom_dict_source:
+		if not CUSTOM_DICT_INITIALIZED:
+			# Replace the default trie with a custom dict trie
+			if type(custom_dict_source) is str:
+				# Receive a file path of the custom dict to read
+				with codecs.open(custom_dict_source, 'r',encoding='utf8') as f:
+					_vocabs = f.read().splitlines()
+					DICT_TRIE = Trie(_vocabs)
+			elif isinstance(custom_dict_source, (list, tuple, set)):
+				# Received a sequence type object of vocabs
+				DICT_TRIE = Trie(custom_dict_source)
 	if engine=="newmm":
 		from .newmm import mmcut as segment
-		return segment(text, data=CUSTOM_DICT_TRIE)
 	elif engine=="mm":
 		from .mm import segment
 	elif engine=='longest-matching':
 		from .longest import segment
 	elif engine=='wordcutpy':
 		from .wordcutpy import segment
-	return segment(text, data=VOCABS)
+	return segment(text)
 
 def word_tokenize(text,engine='newmm'):
 	"""
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 1f9b6df39..8ceda1b96 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -9,9 +9,8 @@
 import re
 from collections import defaultdict
 from heapq import heappush, heappop  # for priority queue
-from marisa_trie import Trie
 from pythainlp.corpus.thaiword import get_data  # ดึงข้อมูลรายการคำในภาษาไทย
-
+# from pythainlp.tokenize import DICT_TRIE
 
 # ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
 pat_eng = re.compile(r'''(?x)
@@ -49,8 +48,8 @@
 [เ-ไ]ct
 """.replace('c', '[ก-ฮ]').replace('t', '[่-๋]?').split()
 
-THAI_WORDS = Trie(get_data())
-
+from pythainlp.tools import file_trie
+DICT_TRIE = file_trie(data='old')
 
 def tcc(w):
     p = 0
@@ -83,13 +82,7 @@ def bfs_paths_graph(graph, start, goal):
       else:
         queue.append((next, path+[next]))
 
-def onecut(text, trie=None):
-#   if(data != ['']):
-#       trie = Trie(data)
-#   else:
-#       trie = THAI_WORDS
-  if not trie:
-      trie = THAI_WORDS
+def onecut(text):
   graph = defaultdict(list)  # main data structure
   allow_pos = tcc_pos(text)     # ตำแหน่งที่ตัด ต้องตรงกับ tcc
   
@@ -98,7 +91,7 @@ def onecut(text, trie=None):
   while q[0] < len(text):
       p = heappop(q)
 
-      for w in trie.prefixes(text[p:]):
+      for w in DICT_TRIE.prefixes(text[p:]):
           p_ = p + len(w)
           if p_ in allow_pos:  # เลือกที่สอดคล้อง tcc
             graph[p].append(p_)
@@ -122,7 +115,7 @@ def onecut(text, trie=None):
           else: # skip น้อยที่สุด ที่เป็นไปได้
               for i in range(p+1, len(text)):
                   if i in allow_pos:   # ใช้ tcc ด้วย
-                      ww = [w for w in trie.prefixes(text[i:]) if (i+len(w) in allow_pos)]
+                      ww = [w for w in DICT_TRIE.prefixes(text[i:]) if (i+len(w) in allow_pos)]
                       m = pat_eng.match(text[i:])
                       if ww or m:
                           break
@@ -136,6 +129,5 @@ def onecut(text, trie=None):
 
 # ช่วยให้ไม่ต้องพิมพ์ยาวๆ
 
-
-def mmcut(text, data=None):
-    return list(onecut(text, trie=data))
\ No newline at end of file
+def mmcut(text):
+    return list(onecut(text))
\ No newline at end of file

From 38ca4bc79aeb845f4c3a5db869ab5baca2a69eb1 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 22:32:32 +0700
Subject: [PATCH 05/17] added create_custom_dict_trie() and modified
 word_tokenize() to use new method of using custom dict to tokenize words

---
 pythainlp/tokenize/__init__.py  | 72 +++++++++++++++++++++++----------
 pythainlp/tokenize/longest.py   | 16 ++------
 pythainlp/tokenize/mm.py        | 17 +++-----
 pythainlp/tokenize/newmm.py     | 15 +++----
 pythainlp/tokenize/wordcutpy.py | 10 ++---
 5 files changed, 70 insertions(+), 60 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 30a8f7f89..f8f3ee459 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -4,15 +4,13 @@
 import re
 import codecs
 from six.moves import zip
-# from pythainlp.tools import file_trie
 from pythainlp.corpus.thaisyllable import get_data
 from pythainlp.corpus.thaiword import get_data as get_dict
 from marisa_trie import Trie
 
-CUSTOM_DICT_INITIALIZED = False
-DICT_TRIE = None #file_trie(data="old")
+DEFAULT_DICT_TRIE = Trie(get_dict())
 
-def dict_word_tokenize(text, engine="newmm", custom_dict_source=None):
+def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	'''
 	dict_word_tokenize(text,file,engine)
 	เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด
@@ -20,27 +18,20 @@ def dict_word_tokenize(text, engine="newmm", custom_dict_source=None):
 	file คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
 	engine คือ เครื่องมือตัดคำ
 	- newmm ตัดคำด้วย newmm
-	- wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+    - wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	- mm ตัดคำด้วย mm
-	- longest-matching ตัดคำโดยใช้ longest matching
+    - longest-matching ตัดคำโดยใช้ longest matching
 	data_type คือ ชนิดข้อมูล
 	- file คือ ไฟล์ข้อมูล
 	- list คือ ข้อมูลที่อยู่ใน list
 	กรณีที่ใช้ list ต้องใช้ data=list(ข้อมูล)
 	'''
-	from pythainlp.tokenize import CUSTOM_DICT_INITIALIZED, DICT_TRIE
-	global CUSTOM_DICT_INITIALIZED, DICT_TRIE # Unable to replace value if 'global' is not declared
-	if custom_dict_source:
-		if not CUSTOM_DICT_INITIALIZED:
-			# Replace the default trie with a custom dict trie
-			if type(custom_dict_source) is str:
-				# Receive a file path of the custom dict to read
-				with codecs.open(custom_dict_source, 'r',encoding='utf8') as f:
-					_vocabs = f.read().splitlines()
-					DICT_TRIE = Trie(_vocabs)
-			elif isinstance(custom_dict_source, (list, tuple, set)):
-				# Received a sequence type object of vocabs
-				DICT_TRIE = Trie(custom_dict_source)
+	if data_type=='file':
+		with codecs.open(file, 'r',encoding='utf8') as f:
+			lines = f.read().splitlines()
+		f.close()
+	elif data_type=='list':
+		lines = data
 	if engine=="newmm":
 		from .newmm import mmcut as segment
 	elif engine=="mm":
@@ -49,9 +40,9 @@ def dict_word_tokenize(text, engine="newmm", custom_dict_source=None):
 		from .longest import segment
 	elif engine=='wordcutpy':
 		from .wordcutpy import segment
-	return segment(text)
+	return segment(text,data=lines)
 
-def word_tokenize(text,engine='newmm'):
+def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 	"""
 	ระบบตัดคำภาษาไทย
 
@@ -67,6 +58,12 @@ def word_tokenize(text,engine='newmm'):
 	- deepcut ใช้ Deep Neural Network ในการตัดคำภาษาไทย
 	- wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	"""
+	from pythainlp.tokenize import DEFAULT_DICT_TRIE
+	if custom_dict_trie:
+		trie = custom_dict_trie
+	else:
+		trie = DEFAULT_DICT_TRIE
+	
 	if engine=='icu':
 		'''
 		ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ
@@ -84,16 +81,19 @@ def word_tokenize(text,engine='newmm'):
 		ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
 		'''
 		from .mm import segment
+		return segment(text, trie)
 	elif engine=='newmm':
 		'''
 		ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
 		'''
 		from .newmm import mmcut as segment
+		return segment(text, trie)
 	elif engine=='longest-matching':
 		'''
 		ใช้ Longest matching ในการตัดคำ
 		'''
 		from .longest import segment
+		return segment(text, trie)
 	elif engine=='pylexto':
 		'''
 		ใช้ LexTo ในการตัดคำ
@@ -114,6 +114,7 @@ def word_tokenize(text,engine='newmm'):
 		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 		'''
 		from .wordcutpy import segment
+		return segment(text, trie)
 	return segment(text)
 
 def sent_tokenize(text,engine='whitespace+newline'):
@@ -184,6 +185,33 @@ def syllable_tokenize(text1):
 		data=dict_word_tokenize(text=text1,data=get_data(),data_type="list")
 	return data
 
+def create_custom_dict_trie(custom_dict_source):
+	"""The function is used to create a custom dict trie which will be
+	used for word_tokenize() function
+	
+	Arguments:
+		custom_dict_source {string or list} -- a list of vocaburaries or a path to source file
+	
+	Raises:
+		ValueError -- Invalid custom_dict_source's object type
+	
+	Returns:
+		Trie -- A trie created from custom dict input
+	"""
+
+	if type(custom_dict_source) is str:
+		# Receive a file path of the custom dict to read
+		with codecs.open(custom_dict_source, 'r',encoding='utf8') as f:
+			_vocabs = f.read().splitlines()
+			return Trie(_vocabs)
+	elif isinstance(custom_dict_source, (list, tuple, set)):
+		# Received a sequence type object of vocabs
+		return Trie(custom_dict_source)
+	else:
+		raise TypeError(
+			'Type of custom_dict_source must be either str (path to source file) or collections'
+		)
+
 class Tokenizer:
 	def __init__(self, custom_dict=None):
 		"""
@@ -207,4 +235,4 @@ def __init__(self, custom_dict=None):
 	
 	def word_tokenize(self, text, engine='newmm'):
 		from .newmm import mmcut as segment
-		return segment(text, data=self.trie_dict)
\ No newline at end of file
+		return segment(text, self.trie_dict)
\ No newline at end of file
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index 841d50fe5..93875750e 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -4,7 +4,6 @@
 Code from https://github.com/patorn/thai-sentiment/blob/78bf461dfdc8a3f0517712fac56dd921dc0f9dd6/thai_sentiment/tokenizer.py
 '''
 import re
-from pythainlp.tools import file_trie
 FRONT_DEP_CHAR = ['ะ', 'ั', 'า ', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ๅ', '็', '์', 'ํ']
 REAR_DEP_CHAR = ['ั', 'ื', 'เ', 'แ', 'โ', 'ใ', 'ไ', 'ํ']
 TONAL_CHAR = ['่', '้', '๊', '๋']
@@ -12,12 +11,8 @@
 
 class Tokenizer(object):
 
-    def __init__(self,data=''):
-        if(data==''):
-            self._trie = file_trie(data="old")
-        else:
-            import marisa_trie
-            self._trie = marisa_trie.Trie(data)
+    def __init__(self, trie):
+        self._trie = trie
 
     @property
     def trie(self):
@@ -112,9 +107,6 @@ def segment_text(self, text):
     def tokenize(self, text):
         tokens = self.segment_text(text)
         return tokens
-def segment(s,data=''):
+def segment(s, trie):
     """ตัดคำภาษาไทยด้วย Longest matching"""
-    return Tokenizer(data).tokenize(s)
-if __name__ == "__main__":
-	s = 'สวัสดีชาวโลกเข้าใจกันไหมพวกคุณ โอเคกันไหม ยสยา ดีแล้วนะคุณเธอ'
-	print(segment(s))
\ No newline at end of file
+    return Tokenizer(trie).tokenize(s)
\ No newline at end of file
diff --git a/pythainlp/tokenize/mm.py b/pythainlp/tokenize/mm.py
index fa13d2d2b..ea972e80e 100644
--- a/pythainlp/tokenize/mm.py
+++ b/pythainlp/tokenize/mm.py
@@ -12,16 +12,14 @@
 from six.moves import range,zip
 import codecs
 import re
-from pythainlp.tools import file_trie
 from pythainlp.corpus import stopwords # load  stopwords
-import marisa_trie
 class wordcut(object):
     """
     ตัดคำภาษาไทยด้วย Maximum Matching algorithm
     """
-    def __init__(self, removeRepeat=True, keyDictionary="", stopDictionary="", removeSpaces=True, minLength=1, stopNumber=False, removeNonCharacter=False, caseSensitive=True, ngram=(1,1), negation=False,data=""):
-        if data!="":
-            d = data # load dictionary
+    def __init__(self, trie, removeRepeat=True, keyDictionary="", stopDictionary="", 
+                 removeSpaces=True, minLength=1, stopNumber=False, removeNonCharacter=False, 
+                 caseSensitive=True, ngram=(1,1), negation=False):
         # load negation listdir
         self.negationDict = []
         if negation:
@@ -43,10 +41,7 @@ def __init__(self, removeRepeat=True, keyDictionary="", stopDictionary="", remov
                 for line in f.read().splitlines():
                     self.keydict.append(line)
 
-        if data=="":
-            self.trie = file_trie(data="old")
-        else:
-            self.trie = marisa_trie.Trie(d)
+        self.trie = trie
         self.removeRepeat = removeRepeat
         self.stopNumber = stopNumber
         self.removeSpaces = removeSpaces
@@ -291,9 +286,9 @@ def mergelistlen(listdata,lennum):
         listlen=len(listdata)
         i+=1
     return listdata
-def segment(text,data=""):
+def segment(text, trie):
     '''
     ใช้ในการตัดตำ segment(str) คืนค่า list
     '''
-    pt = wordcut(stopNumber=False, removeNonCharacter=True, caseSensitive=False,removeRepeat=True,data=data)
+    pt = wordcut(stopNumber=False, removeNonCharacter=True, caseSensitive=False,removeRepeat=True, trie=trie)
     return mergelistlen(pt.segment(text),1)
\ No newline at end of file
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 8ceda1b96..c1b560f53 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -9,8 +9,6 @@
 import re
 from collections import defaultdict
 from heapq import heappush, heappop  # for priority queue
-from pythainlp.corpus.thaiword import get_data  # ดึงข้อมูลรายการคำในภาษาไทย
-# from pythainlp.tokenize import DICT_TRIE
 
 # ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
 pat_eng = re.compile(r'''(?x)
@@ -48,8 +46,6 @@
 [เ-ไ]ct
 """.replace('c', '[ก-ฮ]').replace('t', '[่-๋]?').split()
 
-from pythainlp.tools import file_trie
-DICT_TRIE = file_trie(data='old')
 
 def tcc(w):
     p = 0
@@ -82,7 +78,7 @@ def bfs_paths_graph(graph, start, goal):
       else:
         queue.append((next, path+[next]))
 
-def onecut(text):
+def onecut(text, trie):
   graph = defaultdict(list)  # main data structure
   allow_pos = tcc_pos(text)     # ตำแหน่งที่ตัด ต้องตรงกับ tcc
   
@@ -91,7 +87,7 @@ def onecut(text):
   while q[0] < len(text):
       p = heappop(q)
 
-      for w in DICT_TRIE.prefixes(text[p:]):
+      for w in trie.prefixes(text[p:]):
           p_ = p + len(w)
           if p_ in allow_pos:  # เลือกที่สอดคล้อง tcc
             graph[p].append(p_)
@@ -115,7 +111,7 @@ def onecut(text):
           else: # skip น้อยที่สุด ที่เป็นไปได้
               for i in range(p+1, len(text)):
                   if i in allow_pos:   # ใช้ tcc ด้วย
-                      ww = [w for w in DICT_TRIE.prefixes(text[i:]) if (i+len(w) in allow_pos)]
+                      ww = [w for w in trie.prefixes(text[i:]) if (i+len(w) in allow_pos)]
                       m = pat_eng.match(text[i:])
                       if ww or m:
                           break
@@ -129,5 +125,6 @@ def onecut(text):
 
 # ช่วยให้ไม่ต้องพิมพ์ยาวๆ
 
-def mmcut(text):
-    return list(onecut(text))
\ No newline at end of file
+
+def mmcut(text, trie):
+    return list(onecut(text, trie))
\ No newline at end of file
diff --git a/pythainlp/tokenize/wordcutpy.py b/pythainlp/tokenize/wordcutpy.py
index f90aaf1fb..05e010c72 100644
--- a/pythainlp/tokenize/wordcutpy.py
+++ b/pythainlp/tokenize/wordcutpy.py
@@ -17,10 +17,8 @@
 		from wordcut import Wordcut
 	except ImportError:
 		sys.exit('Error ! using $ pip install wordcutpy')
-def segment(text,data=""):
-    if data=="":
-        wordcut = Wordcut.bigthai()
-    else:
-        word_list = list(set(data))
-        wordcut = Wordcut(word_list)
+
+def segment(text, trie):
+    word_list = trie.keys()
+    wordcut = Wordcut(word_list)
     return wordcut.tokenize(text)

From 9113cc4e1b6358a6d6210e603692ce219d87d83d Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 22:40:53 +0700
Subject: [PATCH 06/17] removed duplicated and nested return statement

---
 pythainlp/tokenize/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index f8f3ee459..c6ecdeba8 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -9,6 +9,7 @@
 from marisa_trie import Trie
 
 DEFAULT_DICT_TRIE = Trie(get_dict())
+TRIE_WORD_SEGMENT_ENGINES = ['newmm', 'mm', 'longest-matching', 'wordcutpy']
 
 def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	'''
@@ -81,19 +82,16 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
 		'''
 		from .mm import segment
-		return segment(text, trie)
 	elif engine=='newmm':
 		'''
 		ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
 		'''
 		from .newmm import mmcut as segment
-		return segment(text, trie)
 	elif engine=='longest-matching':
 		'''
 		ใช้ Longest matching ในการตัดคำ
 		'''
 		from .longest import segment
-		return segment(text, trie)
 	elif engine=='pylexto':
 		'''
 		ใช้ LexTo ในการตัดคำ
@@ -114,6 +112,8 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 		'''
 		from .wordcutpy import segment
+
+	if engine in TRIE_WORD_SEGMENT_ENGINES:
 		return segment(text, trie)
 	return segment(text)
 

From cff5a1aa7f81d584afd486fb510668a75aedb418 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 22:49:02 +0700
Subject: [PATCH 07/17] changed word segment function in syllable_tokenize

---
 pythainlp/tokenize/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index c6ecdeba8..e0c7fcbf5 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -179,10 +179,10 @@ def syllable_tokenize(text1):
 	if(len(text1)>0):
 		i=0
 		while(i<len(text1)):
-			data.extend(dict_word_tokenize(text=text1[i],data=get_data(),data_type="list"))
+			data.extend(word_tokenize(text=text1[i]))
 			i+=1
 	else:
-		data=dict_word_tokenize(text=text1,data=get_data(),data_type="list")
+		data=word_tokenize(text=text1)
 	return data
 
 def create_custom_dict_trie(custom_dict_source):

From 26246ec17db2797aa9321cad6feb479ba4128275 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:19:00 +0700
Subject: [PATCH 08/17] fixed mistakes in syllable_tokenize() and wordcutpy
 tokenizer

---
 pythainlp/tokenize/__init__.py  | 10 +++++++---
 pythainlp/tokenize/wordcutpy.py |  4 +---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index e0c7fcbf5..83b917172 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -9,7 +9,7 @@
 from marisa_trie import Trie
 
 DEFAULT_DICT_TRIE = Trie(get_dict())
-TRIE_WORD_SEGMENT_ENGINES = ['newmm', 'mm', 'longest-matching', 'wordcutpy']
+TRIE_WORD_SEGMENT_ENGINES = ['newmm', 'mm', 'longest-matching']
 
 def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	'''
@@ -112,6 +112,9 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 		'''
 		from .wordcutpy import segment
+		from wordcut import Wordcut
+		wordcut = Wordcut.bigthai() if trie is DEFAULT_DICT_TRIE else Wordcut(trie.keys())
+		return segment(text, wordcut)
 
 	if engine in TRIE_WORD_SEGMENT_ENGINES:
 		return segment(text, trie)
@@ -176,13 +179,14 @@ def syllable_tokenize(text1):
 	"""
 	text1=word_tokenize(text1)
 	data=[]
+	trie = create_custom_dict_trie(custom_dict_source=get_data())
 	if(len(text1)>0):
 		i=0
 		while(i<len(text1)):
-			data.extend(word_tokenize(text=text1[i]))
+			data.extend(word_tokenize(text=text1[i], custom_dict_trie=trie))
 			i+=1
 	else:
-		data=word_tokenize(text=text1)
+		data=word_tokenize(text=text1, custom_dict_trie=trie)
 	return data
 
 def create_custom_dict_trie(custom_dict_source):
diff --git a/pythainlp/tokenize/wordcutpy.py b/pythainlp/tokenize/wordcutpy.py
index 05e010c72..cee28039c 100644
--- a/pythainlp/tokenize/wordcutpy.py
+++ b/pythainlp/tokenize/wordcutpy.py
@@ -18,7 +18,5 @@
 	except ImportError:
 		sys.exit('Error ! using $ pip install wordcutpy')
 
-def segment(text, trie):
-    word_list = trie.keys()
-    wordcut = Wordcut(word_list)
+def segment(text, wordcut):
     return wordcut.tokenize(text)

From 65aacef2990a4c7d760448ab3045e54818acf781 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:21:42 +0700
Subject: [PATCH 09/17] took nested return out from engine == 'wordcutpy'

---
 pythainlp/tokenize/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 83b917172..0e6059eb5 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -114,10 +114,11 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		from .wordcutpy import segment
 		from wordcut import Wordcut
 		wordcut = Wordcut.bigthai() if trie is DEFAULT_DICT_TRIE else Wordcut(trie.keys())
-		return segment(text, wordcut)
 
 	if engine in TRIE_WORD_SEGMENT_ENGINES:
 		return segment(text, trie)
+	elif engine == 'wordcutpy':
+		return segment(text, wordcut)
 	return segment(text)
 
 def sent_tokenize(text,engine='whitespace+newline'):

From f5575447c16d88b1339effdfaaca38e9fd10e086 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:24:22 +0700
Subject: [PATCH 10/17] took nested return out from engine == 'wordcutpy'

---
 pythainlp/tokenize/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 0e6059eb5..897dc309d 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -108,9 +108,7 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		'''
 		from .cutkum import segment
 	elif engine=='wordcutpy':
-		'''
-		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
-		'''
+		# wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 		from .wordcutpy import segment
 		from wordcut import Wordcut
 		wordcut = Wordcut.bigthai() if trie is DEFAULT_DICT_TRIE else Wordcut(trie.keys())

From f70deff840b1c95d28b314369238755e177c1119 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:27:17 +0700
Subject: [PATCH 11/17] fixed deeply nested control flow statements

---
 pythainlp/tokenize/__init__.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 897dc309d..92b7f1d96 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -111,12 +111,14 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		# wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 		from .wordcutpy import segment
 		from wordcut import Wordcut
-		wordcut = Wordcut.bigthai() if trie is DEFAULT_DICT_TRIE else Wordcut(trie.keys())
+		if trie is DEFAULT_DICT_TRIE:
+			wordcut = Wordcut.bigthai() 
+		else: 
+			wordcut = Wordcut(trie.keys())
+		return segment(text, wordcut)
 
 	if engine in TRIE_WORD_SEGMENT_ENGINES:
 		return segment(text, trie)
-	elif engine == 'wordcutpy':
-		return segment(text, wordcut)
 	return segment(text)
 
 def sent_tokenize(text,engine='whitespace+newline'):

From 8533827818c8c3824f7c3e72d707a6624074c24e Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:29:40 +0700
Subject: [PATCH 12/17] fixed deeply nested control flow statements

---
 pythainlp/tokenize/__init__.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 92b7f1d96..2eed54698 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -108,17 +108,20 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		'''
 		from .cutkum import segment
 	elif engine=='wordcutpy':
-		# wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+		'''
+		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+		'''
 		from .wordcutpy import segment
+		
+	if engine in TRIE_WORD_SEGMENT_ENGINES:
+		return segment(text, trie)
+	elif engine == 'wordcutpy':
 		from wordcut import Wordcut
 		if trie is DEFAULT_DICT_TRIE:
 			wordcut = Wordcut.bigthai() 
 		else: 
 			wordcut = Wordcut(trie.keys())
 		return segment(text, wordcut)
-
-	if engine in TRIE_WORD_SEGMENT_ENGINES:
-		return segment(text, trie)
 	return segment(text)
 
 def sent_tokenize(text,engine='whitespace+newline'):

From ddbefa93529b0d588b5d2447c9f09ed3f8780da9 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:41:23 +0700
Subject: [PATCH 13/17] improved Coverage

---
 pythainlp/tokenize/__init__.py | 49 +++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 2eed54698..6edb59b8a 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -12,7 +12,7 @@
 TRIE_WORD_SEGMENT_ENGINES = ['newmm', 'mm', 'longest-matching']
 
 def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
-	'''
+	"""
 	dict_word_tokenize(text,file,engine)
 	เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด
 	text คือ ข้อความที่ต้องการตัดคำ
@@ -26,7 +26,7 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	- file คือ ไฟล์ข้อมูล
 	- list คือ ข้อมูลที่อยู่ใน list
 	กรณีที่ใช้ list ต้องใช้ data=list(ข้อมูล)
-	'''
+	"""
 	if data_type=='file':
 		with codecs.open(file, 'r',encoding='utf8') as f:
 			lines = f.read().splitlines()
@@ -66,51 +66,52 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		trie = DEFAULT_DICT_TRIE
 	
 	if engine=='icu':
-		'''
+		"""
 		ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ
 		คำเตือน !!! \n คำสั่ง word_tokenize(text) ใน PyThaiNLP 1.6
-		ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm'''
+		ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm
+		"""
 		from .pyicu import segment
 	elif engine=='dict':
-		'''
+		"""
 		ใช้ dicu ในการตัดคำไทย
 		จะคืนค่า False หากไม่สามารถตัดคำไทย
-		'''
+		"""
 		from .dictsegment import segment
 	elif engine=='mm':
-		'''
+		"""
 		ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
-		'''
+		"""
 		from .mm import segment
 	elif engine=='newmm':
-		'''
+		"""
 		ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
-		'''
+		"""
 		from .newmm import mmcut as segment
 	elif engine=='longest-matching':
-		'''
+		"""
 		ใช้ Longest matching ในการตัดคำ
-		'''
+		"""
 		from .longest import segment
 	elif engine=='pylexto':
-		'''
+		"""
 		ใช้ LexTo ในการตัดคำ
-		'''
+		"""
 		from .pylexto import segment
 	elif engine=='deepcut':
-		'''
+		"""
 		ใช้ Deep Neural Network ในการตัดคำภาษาไทย
-		'''
+		"""
 		from .deepcut import segment
 	elif engine=='cutkum':
-		'''
+		"""
 		ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum)
-		'''
+		"""
 		from .cutkum import segment
 	elif engine=='wordcutpy':
-		'''
+		"""
 		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
-		'''
+		"""
 		from .wordcutpy import segment
 		
 	if engine in TRIE_WORD_SEGMENT_ENGINES:
@@ -125,20 +126,20 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 	return segment(text)
 
 def sent_tokenize(text,engine='whitespace+newline'):
-	'''
+	"""
 	sent_tokenize(text,engine='whitespace+newline')
 	ตัดประโยคเบื้องต้น โดยการแบ่งด้วยช่องว่าง
-	'''
+	"""
 	if engine=='whitespace':
 		data=nltk.tokenize.WhitespaceTokenizer().tokenize(text)
 	elif engine=='whitespace+newline':
 		data=re.sub(r'\n+|\s+','|',text,re.U).split('|')
 	return data
 def wordpunct_tokenize(text):
-	'''
+	"""
 	wordpunct_tokenize(text)
 	It is nltk.tokenize.wordpunct_tokenize(text).
-	'''
+	"""
 	return nltk.tokenize.wordpunct_tokenize(text)
 def WhitespaceTokenizer(text):
 	return nltk.tokenize.WhitespaceTokenizer().tokenize(text)

From 1ec02b668d82c518cca6f7ad384d4d1cf26a53fd Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:43:09 +0700
Subject: [PATCH 14/17] improved Coverage

---
 pythainlp/tokenize/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 6edb59b8a..737abdc47 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -109,9 +109,7 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		"""
 		from .cutkum import segment
 	elif engine=='wordcutpy':
-		"""
-		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
-		"""
+		# wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 		from .wordcutpy import segment
 		
 	if engine in TRIE_WORD_SEGMENT_ENGINES:

From 816061c8c979a7cf3ecab967a5ec7f6fb47ffbd6 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Wed, 7 Mar 2018 23:47:46 +0700
Subject: [PATCH 15/17] improved Coverage

---
 pythainlp/tokenize/__init__.py | 49 +++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 737abdc47..2eed54698 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -12,7 +12,7 @@
 TRIE_WORD_SEGMENT_ENGINES = ['newmm', 'mm', 'longest-matching']
 
 def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
-	"""
+	'''
 	dict_word_tokenize(text,file,engine)
 	เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด
 	text คือ ข้อความที่ต้องการตัดคำ
@@ -26,7 +26,7 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 	- file คือ ไฟล์ข้อมูล
 	- list คือ ข้อมูลที่อยู่ใน list
 	กรณีที่ใช้ list ต้องใช้ data=list(ข้อมูล)
-	"""
+	'''
 	if data_type=='file':
 		with codecs.open(file, 'r',encoding='utf8') as f:
 			lines = f.read().splitlines()
@@ -66,50 +66,51 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		trie = DEFAULT_DICT_TRIE
 	
 	if engine=='icu':
-		"""
+		'''
 		ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ
 		คำเตือน !!! \n คำสั่ง word_tokenize(text) ใน PyThaiNLP 1.6
-		ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm
-		"""
+		ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm'''
 		from .pyicu import segment
 	elif engine=='dict':
-		"""
+		'''
 		ใช้ dicu ในการตัดคำไทย
 		จะคืนค่า False หากไม่สามารถตัดคำไทย
-		"""
+		'''
 		from .dictsegment import segment
 	elif engine=='mm':
-		"""
+		'''
 		ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
-		"""
+		'''
 		from .mm import segment
 	elif engine=='newmm':
-		"""
+		'''
 		ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
-		"""
+		'''
 		from .newmm import mmcut as segment
 	elif engine=='longest-matching':
-		"""
+		'''
 		ใช้ Longest matching ในการตัดคำ
-		"""
+		'''
 		from .longest import segment
 	elif engine=='pylexto':
-		"""
+		'''
 		ใช้ LexTo ในการตัดคำ
-		"""
+		'''
 		from .pylexto import segment
 	elif engine=='deepcut':
-		"""
+		'''
 		ใช้ Deep Neural Network ในการตัดคำภาษาไทย
-		"""
+		'''
 		from .deepcut import segment
 	elif engine=='cutkum':
-		"""
+		'''
 		ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum)
-		"""
+		'''
 		from .cutkum import segment
 	elif engine=='wordcutpy':
-		# wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+		'''
+		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+		'''
 		from .wordcutpy import segment
 		
 	if engine in TRIE_WORD_SEGMENT_ENGINES:
@@ -124,20 +125,20 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 	return segment(text)
 
 def sent_tokenize(text,engine='whitespace+newline'):
-	"""
+	'''
 	sent_tokenize(text,engine='whitespace+newline')
 	ตัดประโยคเบื้องต้น โดยการแบ่งด้วยช่องว่าง
-	"""
+	'''
 	if engine=='whitespace':
 		data=nltk.tokenize.WhitespaceTokenizer().tokenize(text)
 	elif engine=='whitespace+newline':
 		data=re.sub(r'\n+|\s+','|',text,re.U).split('|')
 	return data
 def wordpunct_tokenize(text):
-	"""
+	'''
 	wordpunct_tokenize(text)
 	It is nltk.tokenize.wordpunct_tokenize(text).
-	"""
+	'''
 	return nltk.tokenize.wordpunct_tokenize(text)
 def WhitespaceTokenizer(text):
 	return nltk.tokenize.WhitespaceTokenizer().tokenize(text)

From f4f66b82770bf627818d7b28463211b7d7bf1d86 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Sun, 11 Mar 2018 17:25:51 +0700
Subject: [PATCH 16/17] moved new tokenizing method to dict_word_tokenize
 instead

---
 pythainlp/tokenize/__init__.py  | 40 +++++++++------------------------
 pythainlp/tokenize/longest.py   |  5 ++++-
 pythainlp/tokenize/mm.py        |  8 +++++--
 pythainlp/tokenize/newmm.py     |  5 ++++-
 pythainlp/tokenize/wordcutpy.py |  9 ++++++--
 5 files changed, 31 insertions(+), 36 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 2eed54698..d2e18f67b 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -11,28 +11,20 @@
 DEFAULT_DICT_TRIE = Trie(get_dict())
 TRIE_WORD_SEGMENT_ENGINES = ['newmm', 'mm', 'longest-matching']
 
-def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
+def dict_word_tokenize(text, custom_dict_trie, engine='newmm'):
 	'''
 	dict_word_tokenize(text,file,engine)
 	เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด
 	text คือ ข้อความที่ต้องการตัดคำ
-	file คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
+	custom_dict_trie คือ trie ที่สร้างจาก create_custom_dict_trie
 	engine คือ เครื่องมือตัดคำ
 	- newmm ตัดคำด้วย newmm
     - wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	- mm ตัดคำด้วย mm
     - longest-matching ตัดคำโดยใช้ longest matching
-	data_type คือ ชนิดข้อมูล
-	- file คือ ไฟล์ข้อมูล
-	- list คือ ข้อมูลที่อยู่ใน list
-	กรณีที่ใช้ list ต้องใช้ data=list(ข้อมูล)
 	'''
-	if data_type=='file':
-		with codecs.open(file, 'r',encoding='utf8') as f:
-			lines = f.read().splitlines()
-		f.close()
-	elif data_type=='list':
-		lines = data
+	trie = custom_dict_trie
+
 	if engine=="newmm":
 		from .newmm import mmcut as segment
 	elif engine=="mm":
@@ -41,7 +33,9 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 		from .longest import segment
 	elif engine=='wordcutpy':
 		from .wordcutpy import segment
-	return segment(text,data=lines)
+		return segment(text, trie.keys())
+	
+	return segment(text, trie)
 
 def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 	"""
@@ -59,11 +53,6 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 	- deepcut ใช้ Deep Neural Network ในการตัดคำภาษาไทย
 	- wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	"""
-	from pythainlp.tokenize import DEFAULT_DICT_TRIE
-	if custom_dict_trie:
-		trie = custom_dict_trie
-	else:
-		trie = DEFAULT_DICT_TRIE
 	
 	if engine=='icu':
 		'''
@@ -112,16 +101,7 @@ def word_tokenize(text, engine='newmm', custom_dict_trie=None):
 		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 		'''
 		from .wordcutpy import segment
-		
-	if engine in TRIE_WORD_SEGMENT_ENGINES:
-		return segment(text, trie)
-	elif engine == 'wordcutpy':
-		from wordcut import Wordcut
-		if trie is DEFAULT_DICT_TRIE:
-			wordcut = Wordcut.bigthai() 
-		else: 
-			wordcut = Wordcut(trie.keys())
-		return segment(text, wordcut)
+
 	return segment(text)
 
 def sent_tokenize(text,engine='whitespace+newline'):
@@ -187,10 +167,10 @@ def syllable_tokenize(text1):
 	if(len(text1)>0):
 		i=0
 		while(i<len(text1)):
-			data.extend(word_tokenize(text=text1[i], custom_dict_trie=trie))
+			data.extend(dict_word_tokenize(text=text1[i], custom_dict_trie=trie))
 			i+=1
 	else:
-		data=word_tokenize(text=text1, custom_dict_trie=trie)
+		data=dict_word_tokenize(text=text1, custom_dict_trie=trie)
 	return data
 
 def create_custom_dict_trie(custom_dict_source):
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index 93875750e..6406c1665 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -3,6 +3,7 @@
 '''
 Code from https://github.com/patorn/thai-sentiment/blob/78bf461dfdc8a3f0517712fac56dd921dc0f9dd6/thai_sentiment/tokenizer.py
 '''
+from pythainlp.tokenize import DEFAULT_DICT_TRIE
 import re
 FRONT_DEP_CHAR = ['ะ', 'ั', 'า ', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ๅ', '็', '์', 'ํ']
 REAR_DEP_CHAR = ['ั', 'ื', 'เ', 'แ', 'โ', 'ใ', 'ไ', 'ํ']
@@ -107,6 +108,8 @@ def segment_text(self, text):
     def tokenize(self, text):
         tokens = self.segment_text(text)
         return tokens
-def segment(s, trie):
+def segment(s, trie=None):
     """ตัดคำภาษาไทยด้วย Longest matching"""
+    if not trie:
+        trie = DEFAULT_DICT_TRIE
     return Tokenizer(trie).tokenize(s)
\ No newline at end of file
diff --git a/pythainlp/tokenize/mm.py b/pythainlp/tokenize/mm.py
index ea972e80e..baa0fa855 100644
--- a/pythainlp/tokenize/mm.py
+++ b/pythainlp/tokenize/mm.py
@@ -13,6 +13,8 @@
 import codecs
 import re
 from pythainlp.corpus import stopwords # load  stopwords
+from pythainlp.tokenize import DEFAULT_DICT_TRIE
+
 class wordcut(object):
     """
     ตัดคำภาษาไทยด้วย Maximum Matching algorithm
@@ -40,7 +42,7 @@ def __init__(self, trie, removeRepeat=True, keyDictionary="", stopDictionary="",
             with codecs.open(keyDictionary, 'r',encoding='utf8') as f:
                 for line in f.read().splitlines():
                     self.keydict.append(line)
-
+        
         self.trie = trie
         self.removeRepeat = removeRepeat
         self.stopNumber = stopNumber
@@ -286,9 +288,11 @@ def mergelistlen(listdata,lennum):
         listlen=len(listdata)
         i+=1
     return listdata
-def segment(text, trie):
+def segment(text, trie=None):
     '''
     ใช้ในการตัดตำ segment(str) คืนค่า list
     '''
+    if not trie:
+        trie = DEFAULT_DICT_TRIE
     pt = wordcut(stopNumber=False, removeNonCharacter=True, caseSensitive=False,removeRepeat=True, trie=trie)
     return mergelistlen(pt.segment(text),1)
\ No newline at end of file
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index c1b560f53..880f3b4a9 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -9,6 +9,7 @@
 import re
 from collections import defaultdict
 from heapq import heappush, heappop  # for priority queue
+from pythainlp.tokenize import DEFAULT_DICT_TRIE
 
 # ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
 pat_eng = re.compile(r'''(?x)
@@ -126,5 +127,7 @@ def onecut(text, trie):
 # ช่วยให้ไม่ต้องพิมพ์ยาวๆ
 
 
-def mmcut(text, trie):
+def mmcut(text, trie=None):
+    if not trie:
+        trie = DEFAULT_DICT_TRIE
     return list(onecut(text, trie))
\ No newline at end of file
diff --git a/pythainlp/tokenize/wordcutpy.py b/pythainlp/tokenize/wordcutpy.py
index cee28039c..02f0c10aa 100644
--- a/pythainlp/tokenize/wordcutpy.py
+++ b/pythainlp/tokenize/wordcutpy.py
@@ -18,5 +18,10 @@
 	except ImportError:
 		sys.exit('Error ! using $ pip install wordcutpy')
 
-def segment(text, wordcut):
-    return wordcut.tokenize(text)
+def segment(text, data=None):
+    if not data:
+        wordcut = Wordcut.bigthai()
+    else:
+        word_list = list(set(data))
+        wordcut = Wordcut(word_list)
+    return wordcut.tokenize(text)
\ No newline at end of file

From 028cb85d1ee8cc53af0e7b2c57ed072e8c281568 Mon Sep 17 00:00:00 2001
From: smeeklai <wizer_boss@hotmail.com>
Date: Sun, 11 Mar 2018 17:33:08 +0700
Subject: [PATCH 17/17] removed unused variables

---
 pythainlp/tokenize/__init__.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index d2e18f67b..d75fa8629 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -9,7 +9,6 @@
 from marisa_trie import Trie
 
 DEFAULT_DICT_TRIE = Trie(get_dict())
-TRIE_WORD_SEGMENT_ENGINES = ['newmm', 'mm', 'longest-matching']
 
 def dict_word_tokenize(text, custom_dict_trie, engine='newmm'):
 	'''
@@ -23,7 +22,6 @@ def dict_word_tokenize(text, custom_dict_trie, engine='newmm'):
 	- mm ตัดคำด้วย mm
     - longest-matching ตัดคำโดยใช้ longest matching
 	'''
-	trie = custom_dict_trie
 
 	if engine=="newmm":
 		from .newmm import mmcut as segment
@@ -33,11 +31,11 @@ def dict_word_tokenize(text, custom_dict_trie, engine='newmm'):
 		from .longest import segment
 	elif engine=='wordcutpy':
 		from .wordcutpy import segment
-		return segment(text, trie.keys())
+		return segment(text, custom_dict_trie.keys())
 	
-	return segment(text, trie)
+	return segment(text, custom_dict_trie)
 
-def word_tokenize(text, engine='newmm', custom_dict_trie=None):
+def word_tokenize(text, engine='newmm'):
 	"""
 	ระบบตัดคำภาษาไทย