diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 6199b24b4..2c08a1a29 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -44,8 +44,9 @@ def corpus_db_path() -> str: def get_corpus_db_detail(name: str) -> dict: db = TinyDB(corpus_db_path()) query = Query() - res = db.search(query.name == name) + db.close() + if res: return res[0] else: @@ -286,8 +287,10 @@ def remove(name: str) -> bool: path = get_corpus_path(name) os.remove(path) db.remove(query.name == name) + db.close() return True + db.close() return False diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 2e2c9c127..4640d0f7c 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -5,11 +5,11 @@ """ from typing import List -import attacut +from attacut import tokenize def segment(text: str) -> List[str]: if not text or not isinstance(text, str): return [] - return attacut.tokenize(text) + return tokenize(text) diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 5cfd07009..47446d7c8 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -9,7 +9,7 @@ from typing import List, Union -import deepcut +from deepcut import tokenize from .trie import Trie @@ -22,6 +22,6 @@ def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[ if isinstance(custom_dict, Trie): custom_dict = list(custom_dict) - return deepcut.tokenize(text, custom_dict) + return tokenize(text, custom_dict) - return deepcut.tokenize(text) + return tokenize(text) diff --git a/pythainlp/tokenize/ssg.py b/pythainlp/tokenize/ssg.py index 37bf19880..94e02bdf7 100644 --- a/pythainlp/tokenize/ssg.py +++ b/pythainlp/tokenize/ssg.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from typing import List -import ssg +from ssg import syllable_tokenize def segment(text: str) -> List[str]: if not text or not isinstance(text, str): return [] - return ssg.syllable_tokenize(text) + 
return syllable_tokenize(text) diff --git a/setup.py b/setup.py index b5175f11b..2d9c74efb 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,13 @@ Some functionalities, like named-entity recognition, required extra packages. See https://github.com/PyThaiNLP/pythainlp for installation options. + + +Made with ❤️ + +PyThaiNLP Team + +"We build Thai NLP" """ requirements = [ @@ -63,7 +70,7 @@ setup( name="pythainlp", - version="2.1.dev7", + version="2.1.dev8", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", diff --git a/tests/test_corpus.py b/tests/test_corpus.py index fe9cebb96..43fc3ee4c 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -35,10 +35,12 @@ def test_corpus(self): self.assertIsNotNone(thai_words()) self.assertIsNotNone(thai_female_names()) self.assertIsNotNone(thai_male_names()) + self.assertEqual(get_corpus_db_detail("XXX"), {}) self.assertIsNone(download("test")) self.assertIsNone(download("test", force=True)) self.assertIsNotNone(get_corpus_db_detail("test")) self.assertIsNotNone(remove("test")) + self.assertFalse(remove("test")) def test_tnc(self): self.assertIsNotNone(tnc.word_freqs()) @@ -48,6 +50,7 @@ def test_ttc(self): def test_wordnet(self): self.assertIsNotNone(wordnet.langs()) + self.assertTrue("tha" in wordnet.langs()) self.assertEqual( wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"] @@ -69,6 +72,9 @@ def test_wordnet(self): self.assertEqual( wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse) ) + self.assertEqual( + wordnet.lch_similarity(bird, mouse), bird.lch_similarity(mouse) + ) cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key() self.assertIsNotNone(wordnet.lemma_from_key(cat_key)) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 77f8d2002..d47b966b3 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -26,9 +26,6 @@ class TestTokenizePackage(unittest.TestCase): - def 
test_dict_word_tokenize(self): - self.assertEqual(dict_word_tokenize(""), []) - def test_etcc(self): self.assertEqual(etcc.segment(""), "") self.assertIsInstance(etcc.segment("คืนความสุข"), list) @@ -61,24 +58,34 @@ def test_word_tokenize(self): self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut") ) - self.assertIsNotNone( - word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX") - ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut") ) + self.assertIsNotNone( + word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX") + ) # XX engine does not exist self.assertIsNotNone(dict_trie(())) self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie"))) self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"])) + self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"})) self.assertIsNotNone(dict_trie(thai_words())) self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE)) self.assertIsNotNone( dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME)) ) - self.assertIsNotNone( - word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE) + self.assertTrue( + "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])) + ) + + # Commented out until this unittest bug gets fixed: + # https://bugs.python.org/issue29620 + # with self.assertWarns(DeprecationWarning): + # dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE) + self.assertEqual( + word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])), + dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])), ) def test_Tokenizer(self): @@ -224,18 +231,30 @@ def test_sent_tokenize(self): def test_subword_tokenize(self): self.assertEqual(subword_tokenize(None), []) self.assertEqual(subword_tokenize(""), []) + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="tcc")) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc") + ) self.assertEqual(subword_tokenize(None, engine="etcc"), []) self.assertEqual(subword_tokenize("", engine="etcc"), 
[]) self.assertIsNotNone( subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc") ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") + ) self.assertIsNotNone(subword_tokenize("เบียร์สิงห์", engine="etcc")) self.assertEqual(subword_tokenize(None, engine="ssg"), []) self.assertEqual(subword_tokenize("", engine="ssg"), []) - self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) + self.assertTrue( + "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) def test_syllable_tokenize(self): self.assertEqual(syllable_tokenize(None), []) @@ -243,12 +262,14 @@ def test_syllable_tokenize(self): self.assertEqual( syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] ) + self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก")) self.assertEqual(syllable_tokenize(None, engine="ssg"), []) self.assertEqual(syllable_tokenize("", engine="ssg"), []) self.assertEqual( syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] ) + self.assertFalse("า" in syllable_tokenize("แมวกินปลา", engine="etcc")) def test_tcc(self): self.assertEqual(tcc.segment(None), []) diff --git a/tests/test_util.py b/tests/test_util.py index 358f27481..11ed7f171 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -168,10 +168,10 @@ def test_thai_strftime(self): # ### pythainlp.util.thai_time def test_thai_time(self): + self.assertEqual(thai_time("8:17"), thai_time("08:17")) self.assertEqual(thai_time("8:17"), "แปดนาฬิกาสิบเจ็ดนาที") self.assertEqual(thai_time("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที") self.assertEqual(thai_time("8:17", "m6h"), "แปดโมงสิบเจ็ดนาที") - self.assertEqual(thai_time("18:30", "m6h"), "หกโมงครึ่ง") self.assertEqual(thai_time("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง") self.assertEqual( thai_time(datetime.time(12, 3, 0)), "สิบสองนาฬิกาสามนาที" @@ -181,23 +181,38 @@ def test_thai_time(self): "สิบสองนาฬิกาสามนาทีหนึ่งวินาที", ) 
self.assertEqual( - thai_time( - datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s" - ), + thai_time(datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s"), "สิบสองนาฬิกาสามนาทีศูนย์วินาที", ) self.assertEqual( - thai_time( - datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m" - ), + thai_time(datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m"), "สิบสองนาฬิกาสามนาที", ) self.assertEqual( - thai_time( - datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m" - ), + thai_time(datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m"), "เที่ยงครึ่ง", ) + self.assertEqual(thai_time("18:30"), "สิบแปดนาฬิกาสามสิบนาที") + self.assertEqual(thai_time("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที") + self.assertEqual( + thai_time("18:30:01"), "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที" + ) + self.assertEqual( + thai_time("18:30:01", precision="m"), "สิบแปดนาฬิกาสามสิบนาที" + ) + self.assertEqual( + thai_time("18:30:01", precision="s"), + "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที", + ) + self.assertEqual( + thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง" + ) + self.assertEqual( + thai_time("18:30:01", fmt="m6h"), "หกโมงสามสิบนาทีหนึ่งวินาที" + ) + self.assertEqual( + thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง" + ) self.assertIsNotNone(thai_time("0:30")) self.assertIsNotNone(thai_time("0:30", "6h")) self.assertIsNotNone(thai_time("0:30", "m6h")) @@ -228,7 +243,12 @@ def test_thai_time(self): def test_delete_tone(self): self.assertEqual(delete_tone("จิ้น"), "จิน") self.assertEqual(delete_tone("เก๋า"), "เกา") - self.assertEqual(delete_tone("จิ้น"), deletetone("จิ้น")) + + # Commented out until this unittest bug gets fixed: + # https://bugs.python.org/issue29620 + # with self.assertWarns(DeprecationWarning): + # deletetone("จิ้น") + self.assertEqual(deletetone("จิ้น"), delete_tone("จิ้น")) def test_normalize(self): self.assertEqual(normalize("เเปลก"), "แปลก") @@ -256,7 +276,6 @@ def test_isthai(self): self.assertEqual(isthai("(ต.ค.)", ignore_chars=".()"), True) def 
test_is_native_thai(self): - self.assertEqual(is_native_thai("เลข"), thaicheck("เลข")) self.assertEqual(is_native_thai(None), False) self.assertEqual(is_native_thai(""), False) self.assertEqual(is_native_thai("116"), False) @@ -276,3 +295,9 @@ def test_is_native_thai(self): self.assertEqual(is_native_thai("เลข"), False) self.assertEqual(is_native_thai("เทเวศน์"), False) self.assertEqual(is_native_thai("เทเวศร์"), False) + + # Commented out until this unittest bug gets fixed: + # https://bugs.python.org/issue29620 + # with self.assertWarns(DeprecationWarning): + # thaicheck("เลข") + self.assertEqual(thaicheck("เลข"), is_native_thai("เลข")) diff --git a/tox.ini b/tox.ini index e5573e5b5..2f7962f7b 100644 --- a/tox.ini +++ b/tox.ini @@ -16,7 +16,7 @@ setenv = commands = python setup.py test -; If you want to make tox run the tests with the same versions, create a +; If you want to make tox run the tests with the same versions, create a ; requirements.txt with the pinned versions and uncomment the following lines: ; deps = ; -r{toxinidir}/requirements.txt