Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code cleaning + small optimization #133

Merged
merged 3 commits into from
Oct 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/romanization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-

from pythainlp.romanization import romanization
from pythainlp.romanization import romanize

print(romanization("แมว"))
print(romanize("แมว"))
2 changes: 1 addition & 1 deletion pythainlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
__version__ = 1.7
from pythainlp.sentiment import sentiment
from pythainlp.spell import spell
from pythainlp.romanization import romanization
from pythainlp.romanization import romanize
from pythainlp.tokenize import word_tokenize,sent_tokenize,tcc,etcc
from pythainlp.rank import rank
from pythainlp.change import texttothai,texttoeng
Expand Down
6 changes: 4 additions & 2 deletions pythainlp/chunk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
#from __future__ import absolute_import,unicode_literals
# TODO

# from __future__ import absolute_import, unicode_literals

# TODO: Chunking
30 changes: 21 additions & 9 deletions pythainlp/collation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,27 @@
# -*- coding: utf-8 -*-
"""
Thai collation (sort according to dictionary order)
For Unicode collation, please refer to Unicode Common Locale Data Repository (CLDR)
https://unicode.org/cldr/charts/latest/collation/th.html
"""
from __future__ import absolute_import, unicode_literals, print_function
import re

# Tone marks and other signs in the range U+0E47-U+0E4C.
RE_TONE = re.compile(r"[็-์]")
# Every character that is NOT in that range; used to isolate the tone marks.
RE_NOT_TONE = re.compile(r"[^็-์]")
# A leading vowel (U+0E40-U+0E44) immediately followed by a consonant.
RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")

try:
    import icu

    # Use ICU's Thai collator for sort keys when PyICU is importable.
    thkey = icu.Collator.createInstance(icu.Locale("th_TH")).getSortKey
except ImportError:

    def thkey(word):
        """
        Build a sort key for a Thai word without relying on PyICU.

        The key is the word with tone marks removed and each leading vowel
        swapped behind its consonant, followed by a tie-breaker in which
        every non-tone character is blanked out so that only the tone marks
        (in their original positions) remain.

        :param str word: Thai word
        :return: string usable as a ``key`` for ``sorted``
        """
        cv = RE_TONE.sub("", word)  # remove tone
        cv = RE_LV_C.sub(r"\2\1", cv)  # switch lead vowel
        # Keep ONLY the tone marks: blanking the tone marks themselves would
        # make words that differ just by tone compare equal.
        tone = RE_NOT_TONE.sub(" ", word)  # just tone
        return cv + tone


def collation(data):
"""
Expand All @@ -23,8 +34,9 @@ def collation(data):
"""
return sorted(data, key=thkey)


if __name__ == "__main__":
    # Quick self-check: each line prints True when collation() returns the
    # expected dictionary order for the sample words.
    a = collation(["ไก่", "ไข่", "ก", "ฮา"]) == ["ก", "ไก่", "ไข่", "ฮา"]
    print(a)
    print(collation(["หลาย", "หญิง"]) == ["หญิง", "หลาย"])
    print(collation(["ไก่", "เป็ด", "หมู", "วัว"]) == ["ไก่", "เป็ด", "วัว", "หมู"])
50 changes: 25 additions & 25 deletions pythainlp/romanization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals

from __future__ import absolute_import, unicode_literals
from pythainlp.tokenize import word_tokenize
# Transliterate Thai text into Latin script
def romanization(data, engine="royin"):
    """
    Romanize Thai text with the selected engine.

    :param str data: Thai text to be romanized
    :param str engine: choose between 'royin' (default), 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the International Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization.
    :return: English (more or less) text that spells out how the Thai text should read.
    :raises ValueError: if ``engine`` is not one of the names above
    """
    # Resolve the engine first so an unknown name fails before any
    # tokenization work is done. The per-word function is imported under a
    # distinct local name to avoid shadowing this function.
    if engine == "royin":
        from .royin import romanization as romanize_word
    elif engine == "pyicu":
        from .pyicu import romanization as romanize_word
    elif engine == "thai2rom":
        from pythainlp.romanization.thai2rom import thai2rom

        # This engine is given the whole text, so tokenizing is unnecessary.
        return thai2rom().romanization(data)
    else:
        raise ValueError("error no have engine.")

    # Romanize word by word and concatenate without separators.
    return "".join(romanize_word(word) for word in word_tokenize(data))


# Transliterate Thai text into Latin script
def romanize(text, engine="royin"):
    """
    Romanize Thai text with the selected engine.

    :param str text: Thai text to be romanized
    :param str engine: choose between 'royin' (default), 'pyicu', and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the International Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization. Any other value falls back to 'royin'.
    :return: English (more or less) text that spells out how the Thai text should read.
    """
    if engine == "thai2rom":
        from .thai2rom import ThaiTransliterator

        # This engine is given the whole text, so no tokenization is done.
        return ThaiTransliterator().romanize(text)

    # The per-word function is imported under a distinct local name so it
    # does not shadow this function.
    if engine == "pyicu":
        from .pyicu import romanize as romanize_word
    else:  # use default engine "royin"
        from .royin import romanize as romanize_word

    # Romanize word by word and concatenate without separators.
    return "".join([romanize_word(word) for word in word_tokenize(text)])
30 changes: 17 additions & 13 deletions pythainlp/romanization/pyicu.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals

from __future__ import absolute_import, unicode_literals
import sys

try:
import icu
import icu
except ImportError:
from pythainlp.tools import install_package
install_package('pyicu')
try:
import icu
except ImportError:
sys.exit('Error ! using pip install pyicu')
from pythainlp.tools import install_package

install_package("pyicu")
try:
import icu
except ImportError:
sys.exit("Error: please pip install pyicu")


# Transliterate Thai text into Latin script
def romanization(data):
    """Transliterate Thai text into Latin script; takes a ``str`` and returns the Latin-script ``str``."""
    transliterator = icu.Transliterator.createInstance('Thai-Latin')
    latin_text = transliterator.transliterate(data)
    return latin_text
# Transliterate Thai text into Latin script
def romanize(data):
    """Transliterate Thai text into Latin script; takes a ``str`` and returns the Latin-script ``str``."""
    transliterator = icu.Transliterator.createInstance("Thai-Latin")
    latin_text = transliterator.transliterate(data)
    return latin_text
Loading