Skip to content

Commit

Permalink
Merge 9532cb0 into 590e24d
Browse files Browse the repository at this point in the history
  • Loading branch information
wannaphong committed Jan 31, 2021
2 parents 590e24d + 9532cb0 commit e25af96
Show file tree
Hide file tree
Showing 7 changed files with 232 additions and 232 deletions.
100 changes: 50 additions & 50 deletions pythainlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,50 @@
# -*- coding: utf-8 -*-
__version__ = "2.2.6"

# Thai consonant letters, listed in code point order.
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"  # 44 chars

# Every Thai vowel character. Written as two adjacent literals, which
# Python concatenates into a single string.
thai_vowels = (
    "\u0e24\u0e26\u0e30\u0e31\u0e32\u0e33\u0e34\u0e35\u0e36\u0e37"
    "\u0e38\u0e39\u0e40\u0e41\u0e42\u0e43\u0e44\u0e45\u0e4d\u0e47"
)  # 20 chars

# Subsets of thai_vowels (lead / follow / above / below).
# \u0e24 and \u0e26 are in thai_vowels but in none of these subsets.
thai_lead_vowels = "\u0e40\u0e41\u0e42\u0e43\u0e44"  # 5 chars
thai_follow_vowels = "\u0e30\u0e32\u0e33\u0e45"  # 4 chars
thai_above_vowels = "\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47"  # 7 chars
thai_below_vowels = "\u0e38\u0e39"  # 2 chars

thai_tonemarks = "\u0e48\u0e49\u0e4a\u0e4b"  # 4 chars

# Signs that can be part of a word:
# Paiyannoi, Maiyamok, Phinthu, Thanthakhat, Nikhahit, Yamakkan.
thai_signs = "\u0e2f\u0e3a\u0e46\u0e4c\u0e4d\u0e4e"  # 6 chars

# Any Thai character that can be part of a word.
# NOTE: \u0e4d is listed in both thai_vowels and thai_signs, so it occurs
# twice here; the length of 74 counts that duplicate.
thai_letters = (
    thai_consonants + thai_vowels + thai_tonemarks + thai_signs
)  # 74 chars

# Section markers: Fongman, Angkhankhu, Khomut.
thai_punctuations = "\u0e4f\u0e5a\u0e5b"  # 3 chars

thai_digits = "๐๑๒๓๔๕๖๗๘๙"  # 10 chars
thai_symbols = "\u0e3f"  # Thai Baht sign ฿

# All Thai characters defined above: letters, then punctuation marks,
# then digits, then symbols.
thai_characters = (
    thai_letters + thai_punctuations + thai_digits + thai_symbols
)


from pythainlp.soundex import soundex
from pythainlp.spell import correct, spell
from pythainlp.tag import pos_tag
from pythainlp.tokenize import (
Tokenizer,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)
from pythainlp.transliterate import romanize, transliterate
from pythainlp.util import collate, thai_strftime
# -*- coding: utf-8 -*-
__version__ = "2.2.7-dev0"

# Thai consonant letters, listed in code point order.
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"  # 44 chars

# Every Thai vowel character. Written as two adjacent literals, which
# Python concatenates into a single string.
thai_vowels = (
    "\u0e24\u0e26\u0e30\u0e31\u0e32\u0e33\u0e34\u0e35\u0e36\u0e37"
    "\u0e38\u0e39\u0e40\u0e41\u0e42\u0e43\u0e44\u0e45\u0e4d\u0e47"
)  # 20 chars

# Subsets of thai_vowels (lead / follow / above / below).
# \u0e24 and \u0e26 are in thai_vowels but in none of these subsets.
thai_lead_vowels = "\u0e40\u0e41\u0e42\u0e43\u0e44"  # 5 chars
thai_follow_vowels = "\u0e30\u0e32\u0e33\u0e45"  # 4 chars
thai_above_vowels = "\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47"  # 7 chars
thai_below_vowels = "\u0e38\u0e39"  # 2 chars

thai_tonemarks = "\u0e48\u0e49\u0e4a\u0e4b"  # 4 chars

# Signs that can be part of a word:
# Paiyannoi, Maiyamok, Phinthu, Thanthakhat, Nikhahit, Yamakkan.
thai_signs = "\u0e2f\u0e3a\u0e46\u0e4c\u0e4d\u0e4e"  # 6 chars

# Any Thai character that can be part of a word.
# NOTE: \u0e4d is listed in both thai_vowels and thai_signs, so it occurs
# twice here; the length of 74 counts that duplicate.
thai_letters = (
    thai_consonants + thai_vowels + thai_tonemarks + thai_signs
)  # 74 chars

# Section markers: Fongman, Angkhankhu, Khomut.
thai_punctuations = "\u0e4f\u0e5a\u0e5b"  # 3 chars

thai_digits = "๐๑๒๓๔๕๖๗๘๙"  # 10 chars
thai_symbols = "\u0e3f"  # Thai Baht sign ฿

# All Thai characters defined above: letters, then punctuation marks,
# then digits, then symbols.
thai_characters = (
    thai_letters + thai_punctuations + thai_digits + thai_symbols
)


from pythainlp.soundex import soundex
from pythainlp.spell import correct, spell
from pythainlp.tag import pos_tag
from pythainlp.tokenize import (
Tokenizer,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)
from pythainlp.transliterate import romanize, transliterate
from pythainlp.util import collate, thai_strftime
4 changes: 2 additions & 2 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ def pos_tag(
return []

if engine == "perceptron":
from .perceptron import tag as tag_
from pythainlp.tag.perceptron import tag as tag_
else: # default, use "unigram" ("old") engine
from .unigram import tag as tag_
from pythainlp.tag.unigram import tag as tag_

word_tags = tag_(words, corpus=corpus)

Expand Down
24 changes: 12 additions & 12 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
['และ', 'คุณ', 'เล่น', 'มือถือ'],
['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
"""
from .crfcls import segment
from pythainlp.tokenize.crfcls import segment

return segment(doc)

Expand Down Expand Up @@ -132,35 +132,35 @@ def word_tokenize(
segments = []

if engine == "newmm" or engine == "onecut":
from .newmm import segment
from pythainlp.tokenize.newmm import segment

segments = segment(text, custom_dict)
elif engine == "newmm-safe":
from .newmm import segment
from pythainlp.tokenize.newmm import segment

segments = segment(text, custom_dict, safe_mode=True)
elif engine == "attacut":
from .attacut import segment
from pythainlp.tokenize.attacut import segment

segments = segment(text)
elif engine == "longest":
from .longest import segment
from pythainlp.tokenize.longest import segment

segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
from pythainlp.tokenize.multi_cut import segment

segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
from pythainlp.tokenize.deepcut import segment

if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
elif engine == "icu":
from .pyicu import segment
from pythainlp.tokenize.pyicu import segment

segments = segment(text)
else:
Expand Down Expand Up @@ -244,7 +244,7 @@ def sent_tokenize(
segments = []

if engine == "crfcut":
from .crfcut import segment
from pythainlp.tokenize.crfcut import segment

segments = segment(text)
elif engine == "whitespace":
Expand Down Expand Up @@ -325,9 +325,9 @@ def subword_tokenize(
return []

if engine == "tcc":
from .tcc import segment
from pythainlp.tokenize.tcc import segment
elif engine == "etcc":
from .etcc import segment
from pythainlp.tokenize.etcc import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
Expand Down Expand Up @@ -394,7 +394,7 @@ def syllable_tokenize(
)
)
elif engine == "ssg":
from .ssg import segment
from pythainlp.tokenize.ssg import segment

segments = segment(text)
else:
Expand Down
2 changes: 1 addition & 1 deletion pythainlp/tokenize/newmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie

from .tcc import tcc_pos
from pythainlp.tokenize.tcc import tcc_pos

# match non-Thai tokens
_PAT_NONTHAI = re.compile(
Expand Down
10 changes: 5 additions & 5 deletions pythainlp/transliterate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
return ""

if engine == "thai2rom":
from .thai2rom import romanize
from pythainlp.transliterate.thai2rom import romanize
else: # use default engine "royin"
from .royin import romanize
from pythainlp.transliterate.royin import romanize

return romanize(text)

Expand Down Expand Up @@ -100,10 +100,10 @@ def transliterate(
return ""

if engine == "icu" or engine == "pyicu":
from .pyicu import transliterate
from pythainlp.transliterate.pyicu import transliterate
elif engine == "thaig2p":
from .thaig2p import transliterate
from pythainlp.transliterate.thaig2p import transliterate
else:
from .ipa import transliterate
from pythainlp.transliterate.ipa import transliterate

return transliterate(text)
64 changes: 32 additions & 32 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
[bumpversion]
current_version = 2.2.6
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
serialize =
{major}.{minor}.{patch}-{release}{build}
{major}.{minor}.{patch}

[bumpversion:part:release]
optional_value = prod
first_value = dev
values =
dev
beta
prod

[bumpversion:part:build]

[bumpversion:file:setup.py]
search = version={current_version}
replace = {new_version}

[bumpversion:file:pythainlp/__init__.py]
search = __version__={current_version}
replace = {new_version}

[metadata]
description-file = README.md

[coverage:run]
source = pythainlp
[bumpversion]
current_version = 2.2.7-dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
serialize =
{major}.{minor}.{patch}-{release}{build}
{major}.{minor}.{patch}

[bumpversion:part:release]
optional_value = prod
first_value = dev
values =
dev
beta
prod

[bumpversion:part:build]

[bumpversion:file:setup.py]
search = version="{current_version}"
replace = version="{new_version}"

[bumpversion:file:pythainlp/__init__.py]
search = __version__ = "{current_version}"
replace = __version__ = "{new_version}"

[metadata]
description-file = README.md

[coverage:run]
source = pythainlp

0 comments on commit e25af96

Please sign in to comment.