diff --git a/.travis.yml b/.travis.yml index 2cb939872..4e9d8b6da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,7 +18,7 @@ before_install: - sudo rm -f /etc/boto.cfg install: -- pip install "tensorflow>=1.14,<2" deepcut +- pip install "tensorflow>=2,<3" deepcut - pip install -r requirements.txt - pip install .[full] - pip install coveralls diff --git a/README.md b/README.md index a3f606330..a820eeda1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ [![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp.svg?branch=develop)](https://travis-ci.org/PyThaiNLP/pythainlp) [![Build status](https://ci.appveyor.com/api/projects/status/9g3mfcwchi8em40x?svg=true)](https://ci.appveyor.com/project/wannaphongcom/pythainlp-9y1ch) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7)](https://www.codacy.com/app/pythainlp/pythainlp_2?utm_source=github.com&utm_medium=referral&utm_content=PyThaiNLP/pythainlp&utm_campaign=Badge_Grade) -[![Coverage Status](https://coveralls.io/repos/github/PyThaiNLP/pythainlp/badge.svg?branch=dev)](https://coveralls.io/github/PyThaiNLP/pythainlp?branch=dev) [![Google Colab Badge](https://badgen.net/badge/Launch%20Quick%20Start%20Guide/on%20Google%20Colab/blue?icon=terminal)](https://colab.research.google.com/github/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp-get-started.ipynb) +[![Coverage Status](https://coveralls.io/repos/github/PyThaiNLP/pythainlp/badge.svg?branch=dev)](https://coveralls.io/github/PyThaiNLP/pythainlp?branch=dev) [![Google Colab Badge](https://badgen.net/badge/Launch%20Quick%20Start%20Guide/on%20Google%20Colab/blue?icon=terminal)](https://colab.research.google.com/github/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp_get_started.ipynb) [![DOI](https://zenodo.org/badge/61813823.svg)](https://zenodo.org/badge/latestdoi/61813823) Thai Natural Language Processing in Python. 
@@ -24,7 +24,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil **This is a document for development branch (post 2.0). Things will break.** - The latest stable release is [2.0.7](https://github.com/PyThaiNLP/pythainlp/releases) -- The latest development release is [2.1.dev7](https://github.com/PyThaiNLP/pythainlp/releases). See [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181). +- The latest development release is [2.1.dev7](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181). - 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page @@ -89,7 +89,7 @@ The data location can be changed, using `PYTHAINLP_DATA_DIR` environment variabl ## Documentation -- [PyThaiNLP Get Started notebook](https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp-get-started.ipynb) +- [PyThaiNLP Get Started](https://www.thainlp.org/pythainlp/tutorials/notebooks/pythainlp_get_started.html) - More tutorials at [https://www.thainlp.org/pythainlp/tutorials/](https://www.thainlp.org/pythainlp/tutorials/) - See full documentation at [https://thainlp.org/pythainlp/docs/2.0/](https://thainlp.org/pythainlp/docs/2.0/) @@ -198,7 +198,7 @@ pip install pythainlp[extra1,extra2,...] 
## เอกสารการใช้งาน -- [notebook เริ่มต้นใช้งาน PyThaiNLP](https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp-get-started.ipynb) +- [เริ่มต้นใช้งาน PyThaiNLP](https://www.thainlp.org/pythainlp/tutorials/notebooks/pythainlp_get_started.html) - สอนการใช้งานเพิ่มเติม ในรูปแบบ notebook [https://www.thainlp.org/pythainlp/tutorials/](https://www.thainlp.org/pythainlp/tutorials/) - เอกสารตัวเต็ม [https://thainlp.org/pythainlp/docs/2.0/](https://thainlp.org/pythainlp/docs/2.0/) diff --git a/appveyor.yml b/appveyor.yml index b5bb5d220..d0e7a45c0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -98,8 +98,8 @@ install: - pip --version - pip install coveralls[yaml] - pip install coverage - - pip install "tensorflow>=1.14,<2" deepcut - - pip install torch==1.2.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + - pip install "tensorflow>=2,<3" deepcut + - pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install %PYICU_PKG% - pip install %ARTAGGER_PKG% - pip install -e .[full] diff --git a/docs/notes/pythainlp-1_7-2_0.rst b/docs/notes/pythainlp-1_7-2_0.rst deleted file mode 100644 index b3ef6d408..000000000 --- a/docs/notes/pythainlp-1_7-2_0.rst +++ /dev/null @@ -1,96 +0,0 @@ -From PyThaiNLP 1.7 to PyThaiNLP 2.0 -=================================== - -Sentiment Analysis ------------------- - -We are removing sentiment analysis in PyThaiNLP 2.0 -https://github.com/PyThaiNLP/pythainlp/issues/172#issuecomment-457456966 - -Soundex -------- - -- from ``pythainlp.soundex.LK82`` to ``pythainlp.soundex.lk82`` -- from ``pythainlp.soundex.Udom83`` to ``pythainlp.soundex.udom83`` - -Romanization ------------- - -- from ``pythainlp.romanization.romanization`` to - ``pythainlp.transliterate.romanize`` - -collation ---------- - -from ``pythainlp.collation.collation`` to ``pythainlp.util.collate`` - -change ------- - -- from ``pythainlp.change.texttothai`` to - ``pythainlp.util.thai_to_eng`` -- from 
``pythainlp.change.texttoeng`` to ``pythainlp.util.eng_to_thai`` - -rank ----- - -from ``pythainlp.rank.rank`` to ``pythainlp.util.rank`` - -number ------- - -- from ``pythainlp.number.thai_num_to_num`` to - ``pythainlp.util.thai_digit_to_arabic_digit`` -- from ``pythainlp.number.num_to_thai_num`` to - ``pythainlp.util.arabic_digit_to_thai_digit`` -- from ``pythainlp.number.num_to_text`` to - ``pythainlp.util.num_to_thaiword`` -- from ``pythainlp.number.text_to_num`` to - ``pythainlp.util.text_to_arabic_digit`` -- from ``pythainlp.number.numtowords`` to - ``pythainlp.util.num_to_thaiword`` - -Named Entity Recognition ------------------------- - -from ``pythainlp.ner.thainer`` to -``pythainlp.tag.named_entity.ThaiNameTagger`` - -MetaSound ---------- - -from ``pythainlp.MetaSound.MetaSound(name)`` to -``pythainlp.soundex.metasound(name)`` - -Corpus ------- - -stopword -~~~~~~~~ - -from ``pythainlp.corpus.stopwords.words("thai")`` to -``pythainlp.corpus.common.thai_stopwords()`` - -Tone in Thai -~~~~~~~~~~~~ - -from ``pythainlp.corpus.tone.get_data()`` to -``pythainlp.thai_tonemarks`` - -Consonant in thai -~~~~~~~~~~~~~~~~~ - -from ``pythainlp.corpus.alphabet.get_data()`` to -``pythainlp.thai_consonants`` - -Word list in thai -~~~~~~~~~~~~~~~~~ - -from ``pythainlp.corpus.thaiword.get_data()`` to -``pythainlp.corpus.thai_words()`` - -Thai country name -~~~~~~~~~~~~~~~~~ - -from ``pythainlp.corpus.country.get_data()`` to -``pythainlp.corpus.countries()`` diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index d7fa5101d..6199b24b4 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -18,7 +18,7 @@ _CORPUS_DB_URL = ( "https://raw.githubusercontent.com/" + "PyThaiNLP/pythainlp-corpus/" - + "master/db.json" + + "2.1/db.json" ) _CORPUS_DB_FILENAME = "db.json" @@ -165,12 +165,12 @@ def _check_hash(dst: str, md5: str) -> NoReturn: @param: md5 place to hash the file (MD5) """ if md5 and md5 != "-": - f = 
open(get_full_data_path(dst), "rb") - content = f.read() - file_md5 = hashlib.md5(content).hexdigest() + with open(get_full_data_path(dst), "rb") as f: + content = f.read() + file_md5 = hashlib.md5(content).hexdigest() - if md5 != file_md5: - raise Exception("Hash does not match expected.") + if md5 != file_md5: + raise Exception("Hash does not match expected.") def download(name: str, force: bool = False) -> NoReturn: diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index b6690018d..112327002 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -76,10 +76,10 @@ def __init__(self): """ Thai named-entity recognizer """ - self.__data_path = get_corpus_path("thainer-1-2") + self.__data_path = get_corpus_path("thainer-1-3") if not self.__data_path: - download("thainer-1-2") - self.__data_path = get_corpus_path("thainer-1-2") + download("thainer-1-3") + self.__data_path = get_corpus_path("thainer-1-3") self.crf = sklearn_crfsuite.CRF( algorithm="lbfgs", c1=0.1, diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 06861406c..cbc36d2a0 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -33,6 +33,8 @@ def word_tokenize( **Options for engine** * *newmm* (default) - dictionary-based, Maximum Matching + Thai Character Cluster + * *newmm-safe* - newmm, with a mechanism to avoid long + processing time for some long continuous text without spaces * *longest* - dictionary-based, Longest Matching * *icu* - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based @@ -101,10 +103,15 @@ def word_tokenize( return [] segments = [] + if engine == "newmm" or engine == "onecut": from .newmm import segment segments = segment(text, custom_dict) + elif engine == "newmm-safe": + from .newmm import segment + + segments = segment(text, custom_dict, safe_mode=True) elif engine == "attacut": from .attacut import segment @@ -157,6 +164,7 @@ def 
dict_word_tokenize( :param bool keep_whitespace: True to keep whitespaces, a common mark for end of phrase in Thai :return: list of words + :rtype: list[str] """ warnings.warn( "dict_word_tokenize is deprecated. Use word_tokenize with a custom_dict argument instead.", @@ -336,6 +344,7 @@ def syllable_tokenize(text: str, engine: str = "default") -> List[str]: tokens.extend(word_tokenize(text=word, custom_dict=trie)) else: from .ssg import segment + tokens = segment(text) return tokens @@ -345,9 +354,10 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: """ Create a dictionary trie which will be used for word_tokenize() function. - :param string/list dict_source: a list of vocaburaries or a path - to source file - :return: a trie created from a dictionary input + :param str|Iterable[str]|pythainlp.tokenize.Trie dict_source: a path to + dictionary file or a list of words or a pythainlp.tokenize.Trie object + :return: a trie object created from a dictionary input + :rtype: pythainlp.tokenize.Trie """ trie = None @@ -359,7 +369,9 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: _vocabs = f.read().splitlines() trie = Trie(_vocabs) elif isinstance(dict_source, Iterable): - # Note: Trie and str are both Iterable, Iterable check should be here + # Note: Trie and str are both Iterable, + # so the Iterable check should be here, at the very end, + # because it is the least specific check # Received a sequence type object of vocabs trie = Trie(dict_source) else: @@ -435,7 +447,9 @@ class Tokenizer: """ def __init__( - self, custom_dict: Union[Trie, Iterable[str], str] = None, engine: str = "newmm" + self, + custom_dict: Union[Trie, Iterable[str], str] = None, + engine: str = "newmm", ): """ Initialize tokenizer object @@ -458,7 +472,9 @@ def word_tokenize(self, text: str) -> List[str]: :return: list of words, tokenized from the text :rtype: list[str] """ - return word_tokenize(text, custom_dict=self.__trie_dict, engine=self.__engine) 
+ return word_tokenize( + text, custom_dict=self.__trie_dict, engine=self.__engine + ) def set_tokenize_engine(self, engine: str) -> None: """ diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index eaf59239d..456e03676 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Enhanced Thai Character Cluster (ETCC) +Enhanced Thai Character Cluster (ETCC) (In progress) Python implementation by Wannaphong Phatthiyaphaibun (19 June 2017) :See Also: @@ -75,5 +75,4 @@ def segment(text: str) -> str: text = re.sub(i, ii + "/", text) text = re.sub("//", "/", text) - return text.split("/") diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index a5bf0fc58..fbf3dacf3 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -14,17 +14,17 @@ import re from collections import defaultdict from heapq import heappop, heappush # for priority queue -from typing import Iterable, List +from typing import Generator, List from pythainlp.tokenize import DEFAULT_DICT_TRIE from .tcc import tcc_pos from .trie import Trie -# ช่วยตัดพวกภาษาอังกฤษ เป็นต้น +# To tokenize English words, for example _PAT_ENG = re.compile( r"""(?x) -[-a-zA-Z]+| # english +[-a-zA-Z]+| # Latin \d[\d,\.]*| # number [ \t]+| # space \r?\n # newline @@ -33,8 +33,14 @@ _PAT_TWOCHARS = re.compile("[ก-ฮ]{,2}$") +_TEXT_LIMIT = 120 +_TEXT_SCAN_LEFT = 20 +_TEXT_SCAN_RIGHT = 20 -def _bfs_paths_graph(graph, start, goal): + +def _bfs_paths_graph( + graph: defaultdict, start: int, goal: List[int] +) -> Generator[List[int], None, None]: queue = [(start, [start])] while queue: (vertex, path) = queue.pop(0) @@ -45,7 +51,7 @@ def _bfs_paths_graph(graph, start, goal): queue.append((next, path + [next])) -def _onecut(text: str, custom_dict: Trie) -> Iterable[str]: +def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]: graph = defaultdict(list) # main data structure allow_pos = tcc_pos(text) # 
separating position should aligned with TCC @@ -56,28 +62,28 @@ def _onecut(text: str, custom_dict: Trie) -> Iterable[str]: for w in custom_dict.prefixes(text[p:]): p_ = p + len(w) - if p_ in allow_pos: # เลือกที่สอดคล้อง tcc + if p_ in allow_pos: # only pick one that is TCC-valid graph[p].append(p_) if p_ not in q: heappush(q, p_) - # กรณี length 1 คือ ไม่กำกวมแล้ว ส่งผลลัพธ์ก่อนนี้คืนได้ + # if length == 1, it is no longer ambiguous; return the previous result if len(q) == 1: pp = next(_bfs_paths_graph(graph, last_p, q[0])) - # เริ่มต้น last_p = pp[0] เอง + # will eventually start at last_p = pp[0] for p in pp[1:]: yield text[last_p:p] last_p = p - # สุดท้าย last_p == q[0] เอง + # will eventually stop at last_p == q[0] - # กรณี length 0 คือ ไม่มีใน dict + # if length == 0, no match was found in the dictionary if len(q) == 0: m = _PAT_ENG.match(text[p:]) - if m: # อังกฤษ, เลข, ว่าง + if m: # Latin characters, numeric, space i = p + m.end() - else: # skip น้อยที่สุด ที่เป็นไปได้ + else: # skip as few characters as possible for i in range(p + 1, len(text)): - if i in allow_pos: # ใช้ tcc ด้วย + if i in allow_pos: # only if TCC-valid ww = [ w for w in custom_dict.prefixes(text[i:]) @@ -96,12 +102,17 @@ def _onecut(text: str, custom_dict: Trie) -> Iterable[str]: heappush(q, i) -def segment(text: str, custom_dict: Trie = None) -> List[str]: +def segment( + text: str, custom_dict: Trie = DEFAULT_DICT_TRIE, safe_mode: bool = False +) -> List[str]: """ Dictionary-based maximal matching word segmentation, constrained with Thai Character Cluster boundaries. 
:param str text: text to be tokenized to words + :param pythainlp.trie.Trie custom_dict: dictionary for tokenization + :param bool safe_mode: True to avoid long wait for long continuous text\ + (edge case); Default is False :return: list of words, tokenized from the text """ if not text or not isinstance(text, str): @@ -110,4 +121,57 @@ def segment(text: str, custom_dict: Trie = None) -> List[str]: if not custom_dict: custom_dict = DEFAULT_DICT_TRIE - return list(_onecut(text, custom_dict)) + if not safe_mode: + return list(_onecut(text, custom_dict)) + + text_len = len(text) + + if text_len < (_TEXT_LIMIT + _TEXT_SCAN_RIGHT): + # if the text is shorter than the limit, + # tokenizes the whole text at once + return list(_onecut(text, custom_dict)) + else: + # if the text is longer than the limit, + # breaks them into smaller chunks then tokenizes each chunk + text_parts = [] + + while text_len >= (_TEXT_LIMIT + _TEXT_SCAN_RIGHT): + sample_start = _TEXT_LIMIT - _TEXT_SCAN_LEFT + sample_end = _TEXT_LIMIT + _TEXT_SCAN_RIGHT + sample = text[sample_start:sample_end] + + # find possible break positions + cut_pos = sample_end + + # try to break by space first + space_idx = sample.rfind(" ") + if space_idx >= 0: + cut_pos = space_idx + 1 + else: + tokens = list(_onecut(sample, custom_dict)) + token_max_idx = 0 + for i, token in enumerate(tokens): + token_max_len = 0 + if len(token) > token_max_len: + token_max_len = len(token) + token_max_idx = i + + # choose the position that covers longest token + cut_pos = sample_start + for i in range(0, token_max_idx): + cut_pos = cut_pos + len(tokens[i]) + + text_parts.append(text[:cut_pos]) + text = text[cut_pos:] + text_len = len(text) + + # append remaining text + if text_len: + text_parts.append(text) + + # tokenizes each text parts + tokens = [] + for text_part in text_parts: + tokens.extend(list(_onecut(text_part, custom_dict))) + + return tokens diff --git a/pythainlp/util/thai_time.py b/pythainlp/util/thai_time.py index 
9c23d8e8d..7ea34735b 100644 --- a/pythainlp/util/thai_time.py +++ b/pythainlp/util/thai_time.py @@ -85,16 +85,16 @@ def _format( else: raise NotImplementedError(fmt) - if precision == "minute" or precision == "second": + if precision == "m" or precision == "s": if ( m == 30 - and (s == 0 or precision == "minute") + and (s == 0 or precision == "m") and (fmt == "6h" or fmt == "m6h") ): text += "ครึ่ง" else: text += num_to_thaiword(m) + "นาที" - if precision == "second": + if precision == "s": text += num_to_thaiword(s) + "วินาที" else: if m: @@ -124,8 +124,8 @@ def thai_time( * *6h* - 6-hour clock * *m6h* - Modified 6-hour clock :param str precision: precision of the spell out - * *minute* - always spell out to minute level - * *second* - always spell out to second level + * *m* - always spell out to minute level + * *s* - always spell out to second level * None - spell out only non-zero parts :return: Time spell out in Thai words :rtype: str @@ -152,7 +152,7 @@ def thai_time( # output: # สิบสองนาฬิกาสามนาที - thai_time(datetime.time(12, 3, 0), precision="second") + thai_time(datetime.time(12, 3, 0), precision="s") # output: # สิบสองนาฬิกาสามนาทีศูนย์วินาที """ diff --git a/pythainlp/word_vector/__init__.py b/pythainlp/word_vector/__init__.py index 8d365cd16..f7cb3ce6c 100644 --- a/pythainlp/word_vector/__init__.py +++ b/pythainlp/word_vector/__init__.py @@ -3,10 +3,11 @@ thai2fit - Thai word vector Code by https://github.com/cstorm125/thai2fit """ -from typing import List +from typing import List, Tuple import numpy as np from gensim.models import KeyedVectors +from gensim.models.keyedvectors import Word2VecKeyedVectors from pythainlp.corpus import download as download_data from pythainlp.corpus import get_corpus, get_corpus_path from pythainlp.tokenize import Tokenizer @@ -25,11 +26,12 @@ def _download() -> str: return path -def get_model(): +def get_model() -> Word2VecKeyedVectors: """ Download model - :return: `gensim` model + :return: `gensim` word2vec model + 
:rtype: gensim.models.keyedvectors.Word2VecKeyedVectors """ return KeyedVectors.load_word2vec_format(_download(), binary=True) @@ -37,7 +39,9 @@ def get_model(): _MODEL = get_model() -def most_similar_cosmul(positive: List[str], negative: List[str]): +def most_similar_cosmul( + positive: List[str], negative: List[str] +) -> List[Tuple[str, float]]: """ This function find the top-10 words that are most similar with respect to from two lists of words labeled as positive and negative. @@ -209,7 +213,7 @@ def similarity(word1: str, word2: str) -> float: return _MODEL.similarity(word1, word2) -def sentence_vectorizer(text: str, use_mean: bool = True): +def sentence_vectorizer(text: str, use_mean: bool = True) -> np.ndarray: """ This function convert a Thai sentence into vector. Specifically, it first tokenize that text and map each tokenized words @@ -259,10 +263,8 @@ def sentence_vectorizer(text: str, use_mean: bool = True): elif word == "\n": word = "xxeol" - if word in _MODEL.wv.index2word: - vec += _MODEL.wv.word_vec(word) - else: - pass + if word in _MODEL.index2word: + vec += _MODEL.word_vec(word) if use_mean: vec /= len(words) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index d85eca5d4..77f8d2002 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -165,6 +165,7 @@ def test_word_tokenize_newmm(self): ), ["จุ๋ม", "ง่วง"], ) + long_text = """ ไต้หวัน (แป่ะเอ๋ยี้: Tâi-oân; ไต่อวัน) หรือ ไถวาน (อักษรโรมัน: Taiwan; จีนตัวย่อ: 台湾; จีนตัวเต็ม: 臺灣/台灣; พินอิน: Táiwān; ไถวาน) หรือชื่อทางการว่า สาธารณรัฐจีน (อังกฤษ: Republic of China; จีนตัวย่อ: 中华民国; จีนตัวเต็ม: 中華民國; พินอิน: Zhōnghuá Mínguó) เป็นรัฐในทวีปเอเชียตะวันออก[7][8][9] ปัจจุบันประกอบด้วยเกาะใหญ่ 5 แห่ง คือ จินเหมิน (金門), ไต้หวัน, เผิงหู (澎湖), หมาจู่ (馬祖), และอูชิว (烏坵) กับทั้งเกาะเล็กเกาะน้อยอีกจำนวนหนึ่ง ท้องที่ดังกล่าวเรียกรวมกันว่า "พื้นที่ไต้หวัน" (臺灣地區) ไต้หวันด้านตะวันตกติดกับจีนแผ่นดินใหญ่ ด้านตะวันออกและตะวันออกเฉียงเหนือติดกับญี่ปุ่น และด้านใต้ติดกับฟิลิปปินส์ 
กรุงไทเปเป็นเมืองหลวง[10] ส่วนไทเปใหม่เป็นเขตปกครองที่จัดตั้งขึ้นใหม่ กินพื้นที่กรุงไทเป และเป็นเขตซึ่งประชากรหนาแน่นที่สุดในเวลานี้ @@ -186,6 +187,20 @@ def test_word_tokenize_newmm(self): วันที่ 24 พฤษภาคม 2560 ศาลรัฐธรรมนูญวินิจฉัยว่ากฎหมายสมรสปัจจุบันในเวลานั้นละเมิดรัฐธรรมนูญโดยปฏิเสธสิทธิสมรสของคู่รักเพศเดียวกันชาวไต้หวัน ศาลวินิจฉัยว่าหากสภานิติบัญญัติไม่ผ่านการแก้ไขกฎหมายที่เพียงพอต่อกฎหมายสมรสของไต้หวันภายในสองปี การสมรสเพศเดียวกันจะชอบด้วยกฎหมายโดยอัตโนมัติในไต้หวัน[17] วันที่ 17 พฤษภาคม 2562 สภานิติบัญญัติไต้หวันอนุมัติร่างกฎหมายทำให้การสมรสเพศเดียวกันชอบด้วยกฎหมาย ทำให้เป็นประเทศแรกในทวีปเอเชียที่ผ่านกฎหมายดังกล่าว[18][19] """ self.assertIsNotNone(word_tokenize(long_text, engine="newmm")) + self.assertIsNotNone(word_tokenize(long_text, engine="newmm-safe")) + + short_danger_text = """ + ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้าน + """ + long_danger_text = """ + ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก + """ + self.assertIsNotNone( + word_tokenize(short_danger_text, engine="newmm-safe") + ) + self.assertIsNotNone( + word_tokenize(long_danger_text, engine="newmm-safe") + ) def test_word_tokenize_attacut(self): self.assertEqual(attacut.segment(None), []) @@ -228,6 +243,9 @@ def test_syllable_tokenize(self): self.assertEqual( syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] ) + + self.assertEqual(syllable_tokenize(None, engine="ssg"), []) + self.assertEqual(syllable_tokenize("", engine="ssg"), []) self.assertEqual( syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] ) diff --git a/tests/test_util.py b/tests/test_util.py index 22b6a3530..358f27481 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -172,7 +172,7 @@ def test_thai_time(self): self.assertEqual(thai_time("8:17", "6h"), 
"สองโมงเช้าสิบเจ็ดนาที") self.assertEqual(thai_time("8:17", "m6h"), "แปดโมงสิบเจ็ดนาที") self.assertEqual(thai_time("18:30", "m6h"), "หกโมงครึ่ง") - self.assertEqual(thai_time("13:30:01", "6h", "minute"), "บ่ายโมงครึ่ง") + self.assertEqual(thai_time("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง") self.assertEqual( thai_time(datetime.time(12, 3, 0)), "สิบสองนาฬิกาสามนาที" ) @@ -182,19 +182,19 @@ def test_thai_time(self): ) self.assertEqual( thai_time( - datetime.datetime(2014, 5, 22, 12, 3, 0), precision="second" + datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s" ), "สิบสองนาฬิกาสามนาทีศูนย์วินาที", ) self.assertEqual( thai_time( - datetime.datetime(2014, 5, 22, 12, 3, 1), precision="minute" + datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m" ), "สิบสองนาฬิกาสามนาที", ) self.assertEqual( thai_time( - datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "minute" + datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m" ), "เที่ยงครึ่ง", ) @@ -210,10 +210,19 @@ def test_thai_time(self): self.assertIsNotNone(thai_time("13:30")) self.assertIsNotNone(thai_time("13:30", "6h")) self.assertIsNotNone(thai_time("13:30", "m6h")) + self.assertIsNotNone(thai_time("15:30")) + self.assertIsNotNone(thai_time("15:30", "6h")) + self.assertIsNotNone(thai_time("15:30", "m6h")) + self.assertIsNotNone(thai_time("18:30")) + self.assertIsNotNone(thai_time("18:30", "6h")) + self.assertIsNotNone(thai_time("18:30", "m6h")) self.assertIsNotNone(thai_time("19:30")) self.assertIsNotNone(thai_time("19:30", "6h")) self.assertIsNotNone(thai_time("19:30", "m6h")) + with self.assertRaises(NotImplementedError): + thai_time("8:17", fmt="xx") + # ### pythainlp.util.normalize def test_delete_tone(self):