Merge 9532cb0 into 590e24d

PyThaiNLP · Jan 31, 2021 · e25af96 · e25af96
2 parents 590e24d + 9532cb0
commit e25af96
Show file tree

Hide file tree

Showing 7 changed files with 232 additions and 232 deletions.
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
@@ -1,50 +1,50 @@
-# -*- coding: utf-8 -*-
-__version__ = "2.2.6"
-
-thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"  # 44 chars
-
-thai_vowels = (
-    "\u0e24\u0e26\u0e30\u0e31\u0e32\u0e33\u0e34\u0e35\u0e36\u0e37"
-    + "\u0e38\u0e39\u0e40\u0e41\u0e42\u0e43\u0e44\u0e45\u0e4d\u0e47"
-)  # 20
-thai_lead_vowels = "\u0e40\u0e41\u0e42\u0e43\u0e44"  # 5
-thai_follow_vowels = "\u0e30\u0e32\u0e33\u0e45"  # 4
-thai_above_vowels = "\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47"  # 7
-thai_below_vowels = "\u0e38\u0e39"  # 2
-
-thai_tonemarks = "\u0e48\u0e49\u0e4a\u0e4b"  # 4
-
-# Paiyannoi, Maiyamok, Phinthu, Thanthakhat, Nikhahit, Yamakkan:
-# These signs can be part of a word
-thai_signs = "\u0e2f\u0e3a\u0e46\u0e4c\u0e4d\u0e4e"  # 6 chars
-
-# Any Thai character that can be part of a word
-thai_letters = "".join(
-    [thai_consonants, thai_vowels, thai_tonemarks, thai_signs]
-)  # 74
-
-# Fongman, Angkhankhu, Khomut:
-# These characters are section markers
-thai_punctuations = "\u0e4f\u0e5a\u0e5b"  # 3 chars
-
-thai_digits = "๐๑๒๓๔๕๖๗๘๙"  # 10
-thai_symbols = "\u0e3f"  # Thai Bath ฿
-
-# All Thai characters that presented in Unicode
-thai_characters = "".join(
-    [thai_letters, thai_punctuations, thai_digits, thai_symbols]
-)
-
-
-from pythainlp.soundex import soundex
-from pythainlp.spell import correct, spell
-from pythainlp.tag import pos_tag
-from pythainlp.tokenize import (
-    Tokenizer,
-    sent_tokenize,
-    subword_tokenize,
-    syllable_tokenize,
-    word_tokenize,
-)
-from pythainlp.transliterate import romanize, transliterate
-from pythainlp.util import collate, thai_strftime
+# -*- coding: utf-8 -*-
+__version__ = "2.2.7-dev0"
+
+thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"  # 44 chars
+
+thai_vowels = (
+    "\u0e24\u0e26\u0e30\u0e31\u0e32\u0e33\u0e34\u0e35\u0e36\u0e37"
+    + "\u0e38\u0e39\u0e40\u0e41\u0e42\u0e43\u0e44\u0e45\u0e4d\u0e47"
+)  # 20
+thai_lead_vowels = "\u0e40\u0e41\u0e42\u0e43\u0e44"  # 5
+thai_follow_vowels = "\u0e30\u0e32\u0e33\u0e45"  # 4
+thai_above_vowels = "\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47"  # 7
+thai_below_vowels = "\u0e38\u0e39"  # 2
+
+thai_tonemarks = "\u0e48\u0e49\u0e4a\u0e4b"  # 4
+
+# Paiyannoi, Phinthu, Maiyamok, Thanthakhat, Nikhahit, Yamakkan:
+# These signs can be part of a word
+thai_signs = "\u0e2f\u0e3a\u0e46\u0e4c\u0e4d\u0e4e"  # 6 chars
+
+# Any Thai character that can be part of a word
+thai_letters = "".join(
+    [thai_consonants, thai_vowels, thai_tonemarks, thai_signs]
+)  # 74
+
+# Fongman, Angkhankhu, Khomut:
+# These characters are section markers
+thai_punctuations = "\u0e4f\u0e5a\u0e5b"  # 3 chars
+
+thai_digits = "๐๑๒๓๔๕๖๗๘๙"  # 10
+thai_symbols = "\u0e3f"  # Thai Baht ฿
+
+# All Thai characters that are present in Unicode
+thai_characters = "".join(
+    [thai_letters, thai_punctuations, thai_digits, thai_symbols]
+)
+
+
+from pythainlp.soundex import soundex
+from pythainlp.spell import correct, spell
+from pythainlp.tag import pos_tag
+from pythainlp.tokenize import (
+    Tokenizer,
+    sent_tokenize,
+    subword_tokenize,
+    syllable_tokenize,
+    word_tokenize,
+)
+from pythainlp.transliterate import romanize, transliterate
+from pythainlp.util import collate, thai_strftime
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
@@ -87,9 +87,9 @@ def pos_tag(
         return []
 
     if engine == "perceptron":
-        from .perceptron import tag as tag_
+        from pythainlp.tag.perceptron import tag as tag_
     else:  # default, use "unigram" ("old") engine
-        from .unigram import tag as tag_
+        from pythainlp.tag.unigram import tag as tag_
 
     word_tags = tag_(words, corpus=corpus)
 

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
@@ -36,7 +36,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
         ['และ', 'คุณ', 'เล่น', 'มือถือ'],
         ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
     """
-    from .crfcls import segment
+    from pythainlp.tokenize.crfcls import segment
 
     return segment(doc)
 
@@ -132,35 +132,35 @@ def word_tokenize(
     segments = []
 
     if engine == "newmm" or engine == "onecut":
-        from .newmm import segment
+        from pythainlp.tokenize.newmm import segment
 
         segments = segment(text, custom_dict)
     elif engine == "newmm-safe":
-        from .newmm import segment
+        from pythainlp.tokenize.newmm import segment
 
         segments = segment(text, custom_dict, safe_mode=True)
     elif engine == "attacut":
-        from .attacut import segment
+        from pythainlp.tokenize.attacut import segment
 
         segments = segment(text)
     elif engine == "longest":
-        from .longest import segment
+        from pythainlp.tokenize.longest import segment
 
         segments = segment(text, custom_dict)
     elif engine == "mm" or engine == "multi_cut":
-        from .multi_cut import segment
+        from pythainlp.tokenize.multi_cut import segment
 
         segments = segment(text, custom_dict)
     elif engine == "deepcut":  # deepcut can optionally use dictionary
-        from .deepcut import segment
+        from pythainlp.tokenize.deepcut import segment
 
         if custom_dict:
             custom_dict = list(custom_dict)
             segments = segment(text, custom_dict)
         else:
             segments = segment(text)
     elif engine == "icu":
-        from .pyicu import segment
+        from pythainlp.tokenize.pyicu import segment
 
         segments = segment(text)
     else:
@@ -244,7 +244,7 @@ def sent_tokenize(
     segments = []
 
     if engine == "crfcut":
-        from .crfcut import segment
+        from pythainlp.tokenize.crfcut import segment
 
         segments = segment(text)
     elif engine == "whitespace":
@@ -325,9 +325,9 @@ def subword_tokenize(
         return []
 
     if engine == "tcc":
-        from .tcc import segment
+        from pythainlp.tokenize.tcc import segment
     elif engine == "etcc":
-        from .etcc import segment
+        from pythainlp.tokenize.etcc import segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
@@ -394,7 +394,7 @@ def syllable_tokenize(
                 )
             )
     elif engine == "ssg":
-        from .ssg import segment
+        from pythainlp.tokenize.ssg import segment
 
         segments = segment(text)
     else:

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
@@ -20,7 +20,7 @@
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie
 
-from .tcc import tcc_pos
+from pythainlp.tokenize.tcc import tcc_pos
 
 # match non-Thai tokens
 _PAT_NONTHAI = re.compile(

diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py
@@ -45,9 +45,9 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
         return ""
 
     if engine == "thai2rom":
-        from .thai2rom import romanize
+        from pythainlp.transliterate.thai2rom import romanize
     else:  # use default engine "royin"
-        from .royin import romanize
+        from pythainlp.transliterate.royin import romanize
 
     return romanize(text)
 
@@ -100,10 +100,10 @@ def transliterate(
         return ""
 
     if engine == "icu" or engine == "pyicu":
-        from .pyicu import transliterate
+        from pythainlp.transliterate.pyicu import transliterate
     elif engine == "thaig2p":
-        from .thaig2p import transliterate
+        from pythainlp.transliterate.thaig2p import transliterate
     else:
-        from .ipa import transliterate
+        from pythainlp.transliterate.ipa import transliterate
 
     return transliterate(text)
diff --git a/setup.cfg b/setup.cfg
@@ -1,32 +1,32 @@
-[bumpversion]
-current_version = 2.2.6
-commit = True
-tag = True
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
-serialize = 
-	{major}.{minor}.{patch}-{release}{build}
-	{major}.{minor}.{patch}
-
-[bumpversion:part:release]
-optional_value = prod
-first_value = dev
-values = 
-	dev
-	beta
-	prod
-
-[bumpversion:part:build]
-
-[bumpversion:file:setup.py]
-search = version={current_version}
-replace = {new_version}
-
-[bumpversion:file:pythainlp/__init__.py]
-search = __version__={current_version}
-replace = {new_version}
-
-[metadata]
-description-file = README.md
-
-[coverage:run]
-source = pythainlp
+[bumpversion]
+current_version = 2.2.7-dev0
+commit = True
+tag = True
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{build}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = prod
+first_value = dev
+values = 
+	dev
+	beta
+	prod
+
+[bumpversion:part:build]
+
+[bumpversion:file:setup.py]
+search = version="{current_version}"
+replace = version="{new_version}"
+
+[bumpversion:file:pythainlp/__init__.py]
+search = __version__ = "{current_version}"
+replace = __version__ = "{new_version}"
+
+[metadata]
+description-file = README.md
+
+[coverage:run]
+source = pythainlp