PyThaiNLP · wannaphong · Oct 21, 2018 · Oct 21, 2018 · Oct 21, 2018 · Oct 21, 2018
diff --git a/examples/romanization.py b/examples/romanization.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
 
-from pythainlp.romanization import romanization
+from pythainlp.romanization import romanize
 
-print(romanization("แมว"))
+print(romanize("แมว"))
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
@@ -3,7 +3,7 @@
 __version__ = 1.7
 from pythainlp.sentiment import sentiment
 from pythainlp.spell import spell
-from pythainlp.romanization import romanization
+from pythainlp.romanization import romanize
 from pythainlp.tokenize import word_tokenize,sent_tokenize,tcc,etcc
 from pythainlp.rank import rank
 from pythainlp.change import texttothai,texttoeng

diff --git a/pythainlp/chunk/__init__.py b/pythainlp/chunk/__init__.py
@@ -1,3 +1,5 @@
 # -*- coding: utf-8 -*-
-#from __future__ import absolute_import,unicode_literals
-# TODO
+
+# from __future__ import absolute_import, unicode_literals
+
+# TODO: Chunking
diff --git a/pythainlp/collation/__init__.py b/pythainlp/collation/__init__.py
@@ -1,16 +1,27 @@
 # -*- coding: utf-8 -*-
+"""
+Thai collation (sort according to dictionary order)
+For Unicode collation, please refer to Unicode Common Locale Data Repository (CLDR)
+https://unicode.org/cldr/charts/latest/collation/th.html
+"""
 from __future__ import absolute_import, unicode_literals, print_function
 import re
 
+RE_TONE = re.compile(r"[็-์]")
+RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")
+
 try:
     import icu
-    thkey = icu.Collator.createInstance(icu.Locale('th_TH')).getSortKey
+
+    thkey = icu.Collator.createInstance(icu.Locale("th_TH")).getSortKey
 except ImportError:
+
     def thkey(word):
-        cv = re.sub('[็-์]', '', word,re.U) # remove tone
-        cv = re.sub('([เ-ไ])([ก-ฮ])', '\\2\\1', cv,re.U) # switch lead vowel
-        tone = re.sub('[^็-์]', ' ', word,re.U) # just tone
-        return cv+tone
+        cv = RE_TONE.sub("", word)  # remove tone marks to get the base spelling
+        cv = RE_LV_C.sub("\\2\\1", cv)  # move a leading vowel after its consonant
+        tone = re.sub(r"[^็-์]", " ", word)  # keep only tone marks; blank everything else
+        return cv + tone
+
 
 def collation(data):
     """
@@ -23,8 +34,9 @@ def collation(data):
     """
     return sorted(data, key=thkey)
 
+
 if __name__ == "__main__":
-	a=collation(['ไก่','ไข่','ก','ฮา'])==['ก', 'ไก่', 'ไข่', 'ฮา']
-	print(a)
-	print(collation(['หลาย','หญิง'])==['หญิง','หลาย'])
-	print(collation(['ไก่', 'เป็ด', 'หมู', 'วัว'])==['ไก่', 'เป็ด', 'วัว', 'หมู'])
+    a = collation(["ไก่", "ไข่", "ก", "ฮา"]) == ["ก", "ไก่", "ไข่", "ฮา"]
+    print(a)
+    print(collation(["หลาย", "หญิง"]) == ["หญิง", "หลาย"])
+    print(collation(["ไก่", "เป็ด", "หมู", "วัว"]) == ["ไก่", "เป็ด", "วัว", "หมู"])
diff --git a/pythainlp/romanization/__init__.py b/pythainlp/romanization/__init__.py
@@ -1,27 +1,27 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
+
+from __future__ import absolute_import, unicode_literals
 from pythainlp.tokenize import word_tokenize
-# ถอดเสียงภาษาไทยเป็น Latin
-def romanization(data,engine='royin'):
-	"""
-	:param str data: Thai text to be romanized
-	:param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization.
-	:return: English (more or less) text that spells out how the Thai text should read.
-	"""
-	word_list=word_tokenize(data)
-	listword=[]
-	i=0
-	if engine=='royin':
-    		from .royin import romanization
-	elif engine=='pyicu':
-    		from .pyicu import romanization
-	elif engine=='thai2rom':
-    		from pythainlp.romanization.thai2rom import thai2rom
-    		thai=thai2rom()
-    		return thai.romanization(data)
-	else:
-    		raise Exception("error no have engine.")
-	while i<len(word_list):
-		listword.append(romanization(word_list[i]))
-		i+=1
-	return ''.join(listword)
+
+
+# ถอดเสียงภาษาไทยเป็นอักษรละติน
+def romanize(text, engine="royin"):
+    """
+    :param str text: Thai text to be romanized
+    :param str engine: choose between 'royin' (default), 'pyicu', and 'thai2rom'. 'royin' will romanize according to the standard of the Thai Royal Institute. 'pyicu' will romanize according to the International Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization. Any other value falls back to 'royin'.
+    :return: English (more or less) text that spells out how the Thai text should read.
+    """
+    if engine == "pyicu":
+        from .pyicu import romanize
+    elif engine == "thai2rom":
+        from .thai2rom import ThaiTransliterator
+
+        thai2rom = ThaiTransliterator()
+        return thai2rom.romanize(text)
+    else:  # use default engine "royin"
+        from .royin import romanize
+
+    words = word_tokenize(text)
+    romanized_words = [romanize(word) for word in words]
+
+    return "".join(romanized_words)
diff --git a/pythainlp/romanization/pyicu.py b/pythainlp/romanization/pyicu.py
@@ -1,18 +1,22 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
+
+from __future__ import absolute_import, unicode_literals
 import sys
+
 try:
-	import icu
+    import icu
 except ImportError:
-	from pythainlp.tools import install_package
-	install_package('pyicu')
-	try:
-		import icu
-	except ImportError:
-		sys.exit('Error ! using pip install pyicu')
+    from pythainlp.tools import install_package
+
+    install_package("pyicu")
+    try:
+        import icu
+    except ImportError:
+        sys.exit("Error: please pip install pyicu")
+
 
-# ถอดเสียงภาษาไทยเป็น Latin
-def romanization(data):
-	"""เป็นคำสั่ง ถอดเสียงภาษาไทยเป็น Latin รับค่า ''str'' ข้อความ คืนค่าเป็น ''str'' ข้อความ Latin"""
-	thai2latin = icu.Transliterator.createInstance('Thai-Latin')
-	return thai2latin.transliterate(data)
+# ถอดเสียงภาษาไทยเป็นอักษรละติน
+def romanize(data):
+    """ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน"""
+    thai2latin = icu.Transliterator.createInstance("Thai-Latin")
+    return thai2latin.transliterate(data)