PyThaiNLP · wannaphong · Mar 12, 2018 · Mar 1, 2018 · Mar 1, 2018 · Mar 3, 2018
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -5,28 +5,24 @@
 import codecs
 from six.moves import zip
 from pythainlp.corpus.thaisyllable import get_data
-def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
+from pythainlp.corpus.thaiword import get_data as get_dict
+from marisa_trie import Trie
+
+DEFAULT_DICT_TRIE = Trie(get_dict())
+
+def dict_word_tokenize(text, custom_dict_trie, engine='newmm'):
 	'''
 	dict_word_tokenize(text,file,engine)
 	เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด
 	text คือ ข้อความที่ต้องการตัดคำ
-	file คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
+	custom_dict_trie คือ trie ที่สร้างจาก create_custom_dict_trie
 	engine คือ เครื่องมือตัดคำ
 	- newmm ตัดคำด้วย newmm
     - wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	- mm ตัดคำด้วย mm
     - longest-matching ตัดคำโดยใช้ longest matching
-	data_type คือ ชนิดข้อมูล
-	- file คือ ไฟล์ข้อมูล
-	- list คือ ข้อมูลที่อยู่ใน list
-	กรณีที่ใช้ list ต้องใช้ data=list(ข้อมูล)
 	'''
-	if data_type=='file':
-		with codecs.open(file, 'r',encoding='utf8') as f:
-			lines = f.read().splitlines()
-		f.close()
-	elif data_type=='list':
-		lines = data
+
 	if engine=="newmm":
 		from .newmm import mmcut as segment
 	elif engine=="mm":
@@ -35,8 +31,11 @@ def dict_word_tokenize(text,file='',engine="newmm",data=[''],data_type="file"):
 		from .longest import segment
 	elif engine=='wordcutpy':
 		from .wordcutpy import segment
-	return segment(text,data=lines)
-def word_tokenize(text,engine='newmm'):
+		return segment(text, custom_dict_trie.keys())
+
+	return segment(text, custom_dict_trie)
+
+def word_tokenize(text, engine='newmm'):
 	"""
 	ระบบตัดคำภาษาไทย
 
@@ -52,54 +51,57 @@ def word_tokenize(text,engine='newmm'):
 	- deepcut ใช้ Deep Neural Network ในการตัดคำภาษาไทย
 	- wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
 	"""
+
 	if engine=='icu':
-    		'''
-			ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ
-    		คำเตือน !!! \n คำสั่ง word_tokenize(text) ใน PyThaiNLP 1.6
-			ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm'''
-    		from .pyicu import segment
+		'''
+		ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ
+		คำเตือน !!! \n คำสั่ง word_tokenize(text) ใน PyThaiNLP 1.6
+		ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm'''
+		from .pyicu import segment
 	elif engine=='dict':
-    		'''
-			ใช้ dicu ในการตัดคำไทย
-			จะคืนค่า False หากไม่สามารถตัดคำไทย
-			'''
-    		from .dictsegment import segment
+		'''
+		ใช้ dicu ในการตัดคำไทย
+		จะคืนค่า False หากไม่สามารถตัดคำไทย
+		'''
+		from .dictsegment import segment
 	elif engine=='mm':
-    		'''
-			ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
-			'''
-    		from .mm import segment
+		'''
+		ใช้ Maximum Matching algorithm - โค้ดชุดเก่า
+		'''
+		from .mm import segment
 	elif engine=='newmm':
-    		'''
-			ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
-			'''
-    		from .newmm import mmcut as segment
+		'''
+		ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่
+		'''
+		from .newmm import mmcut as segment
 	elif engine=='longest-matching':
-    		'''
-			ใช้ Longest matching ในการตัดคำ
-			'''
-    		from .longest import segment
+		'''
+		ใช้ Longest matching ในการตัดคำ
+		'''
+		from .longest import segment
 	elif engine=='pylexto':
-    		'''
-			ใช้ LexTo ในการตัดคำ
-			'''
-    		from .pylexto import segment
+		'''
+		ใช้ LexTo ในการตัดคำ
+		'''
+		from .pylexto import segment
 	elif engine=='deepcut':
-    		'''
-			ใช้ Deep Neural Network ในการตัดคำภาษาไทย
-			'''
-    		from .deepcut import segment
+		'''
+		ใช้ Deep Neural Network ในการตัดคำภาษาไทย
+		'''
+		from .deepcut import segment
 	elif engine=='cutkum':
-    		'''
-			ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum)
-			'''
-    		from .cutkum import segment
+		'''
+		ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum)
+		'''
+		from .cutkum import segment
 	elif engine=='wordcutpy':
-    		'''
-			wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
-			'''
-    		from .wordcutpy import segment
+		'''
+		wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ
+		'''
+		from .wordcutpy import segment
+
 	return segment(text)
+
 def sent_tokenize(text,engine='whitespace+newline'):
 	'''
 	sent_tokenize(text,engine='whitespace+newline')
@@ -119,37 +121,37 @@ def wordpunct_tokenize(text):
 def WhitespaceTokenizer(text):
 	return nltk.tokenize.WhitespaceTokenizer().tokenize(text)
 def isthai(text,check_all=False):
-    """
-    สำหรับเช็คว่าเป็นตัวอักษรภาษาไทยหรือไม่
-    isthai(text,check_all=False)
-    text คือ ข้อความหรือ list ตัวอักษร
-    check_all สำหรับส่งคืนค่า True หรือ False เช็คทุกตัวอักษร
+	"""
+	สำหรับเช็คว่าเป็นตัวอักษรภาษาไทยหรือไม่
+	isthai(text,check_all=False)
+	text คือ ข้อความหรือ list ตัวอักษร
+	check_all สำหรับส่งคืนค่า True หรือ False เช็คทุกตัวอักษร
 
-    การส่งคืนค่า
-    {'thai':% อักษรภาษาไทย,'check_all':tuple โดยจะเป็น (ตัวอักษร,True หรือ False)}
-    """
-    listext=list(text)
-    i=0
-    num_isthai=0
-    if check_all==True:
-        listthai=[]
-    while i<len(listext):
-        cVal = ord(listext[i])
-        if(cVal >= 3584 and cVal <= 3711):
-            num_isthai+=1
-            if check_all==True:
-                listthai.append(True)
-        else:
-            if check_all==True:
-                listthai.append(False)
-        i+=1
-    thai=(num_isthai/len(listext))*100
-    if check_all==True:
-        dictthai=tuple(zip(listext,listthai))
-        data= {'thai':thai,'check_all':dictthai}
-    else:
-        data= {'thai':thai}
-    return data
+	การส่งคืนค่า
+	{'thai':% อักษรภาษาไทย,'check_all':tuple โดยจะเป็น (ตัวอักษร,True หรือ False)}
+	"""
+	listext=list(text)
+	i=0
+	num_isthai=0
+	if check_all==True:
+		listthai=[]
+	while i<len(listext):
+		cVal = ord(listext[i])
+		if(cVal >= 3584 and cVal <= 3711):
+			num_isthai+=1
+			if check_all==True:
+				listthai.append(True)
+		else:
+			if check_all==True:
+				listthai.append(False)
+		i+=1
+	thai=(num_isthai/len(listext))*100
+	if check_all==True:
+		dictthai=tuple(zip(listext,listthai))
+		data= {'thai':thai,'check_all':dictthai}
+	else:
+		data= {'thai':thai}
+	return data
 def syllable_tokenize(text1):
 	"""
 	syllable_tokenize(text)
@@ -159,11 +161,64 @@ def syllable_tokenize(text1):
 	"""
 	text1=word_tokenize(text1)
 	data=[]
+	trie = create_custom_dict_trie(custom_dict_source=get_data())
 	if(len(text1)>0):
 		i=0
 		while(i<len(text1)):
-			data.extend(dict_word_tokenize(text=text1[i],data=get_data(),data_type="list"))
+			data.extend(dict_word_tokenize(text=text1[i], custom_dict_trie=trie))
 			i+=1
 	else:
-		data=dict_word_tokenize(text=text1,data=get_data(),data_type="list")
+		data=dict_word_tokenize(text=text1, custom_dict_trie=trie)
 	return data
+
+def create_custom_dict_trie(custom_dict_source):
+	"""Create a custom dict trie to pass as the custom_dict_trie argument
+	of dict_word_tokenize()
+
+	Arguments:
+		custom_dict_source {str, list, tuple or set} -- a collection of vocabularies,
+			or a path to a UTF-8 text file with one vocabulary per line
+
+	Raises:
+		TypeError -- Invalid custom_dict_source's object type
+
+	Returns:
+		Trie -- A trie created from custom dict input
+	"""
+	if isinstance(custom_dict_source, str):
+		# A string is a file path; read it fully so the file is closed
+		# before the trie is built
+		with codecs.open(custom_dict_source, 'r',encoding='utf8') as f:
+			_vocabs = f.read().splitlines()
+		return Trie(_vocabs)
+	if isinstance(custom_dict_source, (list, tuple, set)):
+		# Received a collection of vocabs
+		return Trie(custom_dict_source)
+	raise TypeError(
+		'Type of custom_dict_source must be either str (path to source file) or collections'
+	)
+
+class Tokenizer:
+	def __init__(self, custom_dict=None):
+		"""
+		Initialize tokenizer object
+
+		Keyword arguments:
+		custom_dict -- a file path or a list/tuple/set of vocabularies used to create
+			the trie; when omitted or empty the dictionary from get_dict() is used
+
+		Object variables:
+		trie_dict -- a trie to use in tokenizing engines
+
+		Raises:
+		TypeError -- custom_dict is not a str path, list, tuple or set
+		"""
+		if custom_dict:
+			# Delegate so an unsupported type raises here instead of leaving trie_dict unset
+			self.trie_dict = create_custom_dict_trie(custom_dict)
+		else:
+			self.trie_dict = Trie(get_dict())
+
+	def word_tokenize(self, text, engine='newmm'):
+		"""Tokenize text with self.trie_dict; engine is forwarded to dict_word_tokenize (it used to be ignored)"""
+		return dict_word_tokenize(text, self.trie_dict, engine=engine)
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
@@ -3,21 +3,17 @@
 '''
 Code from https://github.com/patorn/thai-sentiment/blob/78bf461dfdc8a3f0517712fac56dd921dc0f9dd6/thai_sentiment/tokenizer.py
 '''
+from pythainlp.tokenize import DEFAULT_DICT_TRIE
 import re
-from pythainlp.tools import file_trie
 FRONT_DEP_CHAR = ['ะ', 'ั', 'า ', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ๅ', '็', '์', 'ํ']
 REAR_DEP_CHAR = ['ั', 'ื', 'เ', 'แ', 'โ', 'ใ', 'ไ', 'ํ']
 TONAL_CHAR = ['่', '้', '๊', '๋']
 ENDING_CHAR = ['ๆ', 'ฯ']
 
 class Tokenizer(object):
 
-    def __init__(self,data=''):
-        if(data==''):
-            self._trie = file_trie(data="old")
-        else:
-            import marisa_trie
-            self._trie = marisa_trie.Trie(data)
+    def __init__(self, trie):
+        self._trie = trie
 
     @property
     def trie(self):
@@ -112,9 +108,8 @@ def segment_text(self, text):
     def tokenize(self, text):
         tokens = self.segment_text(text)
         return tokens
-def segment(s,data=''):
+def segment(s, trie=None):
     """ตัดคำภาษาไทยด้วย Longest matching"""
-    return Tokenizer(data).tokenize(s)
-if __name__ == "__main__":
-	s = 'สวัสดีชาวโลกเข้าใจกันไหมพวกคุณ โอเคกันไหม ยสยา ดีแล้วนะคุณเธอ'
-	print(segment(s))
+    if not trie:
+        trie = DEFAULT_DICT_TRIE
+    return Tokenizer(trie).tokenize(s)
diff --git a/pythainlp/tokenize/mm.py b/pythainlp/tokenize/mm.py
@@ -12,16 +12,16 @@
 from six.moves import range,zip
 import codecs
 import re
-from pythainlp.tools import file_trie
 from pythainlp.corpus import stopwords # load  stopwords
-import marisa_trie
+from pythainlp.tokenize import DEFAULT_DICT_TRIE
+
 class wordcut(object):
     """
     ตัดคำภาษาไทยด้วย Maximum Matching algorithm
     """
-    def __init__(self, removeRepeat=True, keyDictionary="", stopDictionary="", removeSpaces=True, minLength=1, stopNumber=False, removeNonCharacter=False, caseSensitive=True, ngram=(1,1), negation=False,data=""):
-        if data!="":
-            d = data # load dictionary
+    def __init__(self, trie, removeRepeat=True, keyDictionary="", stopDictionary="", 
+                 removeSpaces=True, minLength=1, stopNumber=False, removeNonCharacter=False, 
+                 caseSensitive=True, ngram=(1,1), negation=False):
         # load negation listdir
         self.negationDict = []
         if negation:
@@ -42,11 +42,8 @@ def __init__(self, removeRepeat=True, keyDictionary="", stopDictionary="", remov
             with codecs.open(keyDictionary, 'r',encoding='utf8') as f:
                 for line in f.read().splitlines():
                     self.keydict.append(line)
-
-        if data=="":
-            self.trie = file_trie(data="old")
-        else:
-            self.trie = marisa_trie.Trie(d)
+
+        self.trie = trie
         self.removeRepeat = removeRepeat
         self.stopNumber = stopNumber
         self.removeSpaces = removeSpaces
@@ -291,9 +288,11 @@ def mergelistlen(listdata,lennum):
         listlen=len(listdata)
         i+=1
     return listdata
-def segment(text,data=""):
+def segment(text, trie=None):
     '''
     ใช้ในการตัดตำ segment(str) คืนค่า list
     '''
-    pt = wordcut(stopNumber=False, removeNonCharacter=True, caseSensitive=False,removeRepeat=True,data=data)
+    if not trie:
+        trie = DEFAULT_DICT_TRIE
+    pt = wordcut(stopNumber=False, removeNonCharacter=True, caseSensitive=False,removeRepeat=True, trie=trie)
     return mergelistlen(pt.segment(text),1)