From 64ab1feab70d8f619ab8cc0800f13449a03b0b95 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 09:43:19 +0000 Subject: [PATCH 01/11] more tests for corpus functions --- tests/test_corpus.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_corpus.py b/tests/test_corpus.py index fe9cebb96..43fc3ee4c 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -35,10 +35,12 @@ def test_corpus(self): self.assertIsNotNone(thai_words()) self.assertIsNotNone(thai_female_names()) self.assertIsNotNone(thai_male_names()) + self.assertEqual(get_corpus_db_detail("XXX"), {}) self.assertIsNone(download("test")) self.assertIsNone(download("test", force=True)) self.assertIsNotNone(get_corpus_db_detail("test")) self.assertIsNotNone(remove("test")) + self.assertFalse(remove("test")) def test_tnc(self): self.assertIsNotNone(tnc.word_freqs()) @@ -48,6 +50,7 @@ def test_ttc(self): def test_wordnet(self): self.assertIsNotNone(wordnet.langs()) + self.assertTrue("tha" in wordnet.langs()) self.assertEqual( wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"] @@ -69,6 +72,9 @@ def test_wordnet(self): self.assertEqual( wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse) ) + self.assertEqual( + wordnet.lch_similarity(bird, mouse), bird.lch_similarity(mouse) + ) cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key() self.assertIsNotNone(wordnet.lemma_from_key(cat_key)) From dd122774294a4d575853bfd11ca78d6bd178ddee Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 09:52:28 +0000 Subject: [PATCH 02/11] minimize imports --- pythainlp/tokenize/attacut.py | 4 ++-- pythainlp/tokenize/deepcut.py | 6 +++--- pythainlp/tokenize/ssg.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 2e2c9c127..4640d0f7c 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -5,11 +5,11 @@ """ from typing import List -import attacut +from attacut import tokenize def segment(text: str) -> List[str]: if not text or not isinstance(text, str): return [] - return attacut.tokenize(text) + return tokenize(text) diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 5cfd07009..47446d7c8 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -9,7 +9,7 @@ from typing import List, Union -import deepcut +from deepcut import tokenize from .trie import Trie @@ -22,6 +22,6 @@ def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[ if isinstance(custom_dict, Trie): custom_dict = list(custom_dict) - return deepcut.tokenize(text, custom_dict) + return tokenize(text, custom_dict) - return deepcut.tokenize(text) + return tokenize(text) diff --git a/pythainlp/tokenize/ssg.py b/pythainlp/tokenize/ssg.py index 37bf19880..94e02bdf7 100644 --- a/pythainlp/tokenize/ssg.py +++ b/pythainlp/tokenize/ssg.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from typing import List -import ssg +from ssg import syllable_tokenize def segment(text: str) -> List[str]: if not text or not isinstance(text, str): return [] - return ssg.syllable_tokenize(text) + return syllable_tokenize(text) From cd443162b591b23812ad408a2783bba935236cee Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 10:04:15 +0000 Subject: [PATCH 03/11] more test cases for thai_time() --- tests/test_tokenize.py | 2 +- tests/test_util.py | 35 +++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 77f8d2002..8fa153ccd 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -235,7 +235,7 @@ def test_subword_tokenize(self): self.assertEqual(subword_tokenize(None, engine="ssg"), []) self.assertEqual(subword_tokenize("", engine="ssg"), []) - self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) + self.assertTrue("ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) def test_syllable_tokenize(self): self.assertEqual(syllable_tokenize(None), []) diff --git a/tests/test_util.py b/tests/test_util.py index 358f27481..4d8ebe820 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -168,10 +168,10 @@ def test_thai_strftime(self): # ### pythainlp.util.thai_time def test_thai_time(self): + self.assertEqual(thai_time("8:17"), thai_time("08:17")) self.assertEqual(thai_time("8:17"), "แปดนาฬิกาสิบเจ็ดนาที") self.assertEqual(thai_time("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที") self.assertEqual(thai_time("8:17", "m6h"), "แปดโมงสิบเจ็ดนาที") - self.assertEqual(thai_time("18:30", "m6h"), "หกโมงครึ่ง") self.assertEqual(thai_time("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง") self.assertEqual( thai_time(datetime.time(12, 3, 0)), "สิบสองนาฬิกาสามนาที" @@ -181,23 +181,38 @@ def test_thai_time(self): "สิบสองนาฬิกาสามนาทีหนึ่งวินาที", ) self.assertEqual( - thai_time( - datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s" - ), + thai_time(datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s"), "สิบสองนาฬิกาสามนาทีศูนย์วินาที", ) self.assertEqual( - thai_time( - datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m" - ), + thai_time(datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m"), "สิบสองนาฬิกาสามนาที", ) self.assertEqual( - thai_time( - datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m" - ), + thai_time(datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m"), "เที่ยงครึ่ง", ) + self.assertEqual(thai_time("18:30"), "สิบแปดนาฬิกาสามสิบนาที") + self.assertEqual(thai_time("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที") + self.assertEqual( + thai_time("18:30:01"), "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที" + ) + self.assertEqual( + thai_time("18:30:01", precision="m"), "สิบแปดนาฬิกาสามสิบนาที" + ) + self.assertEqual( + thai_time("18:30:01", precision="s"), + "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที", + ) + self.assertEqual( + thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง" + ) + self.assertEqual( + thai_time("18:30:01", fmt="m6h"), "หกโมงสามสิบนาทีหนึ่งวินาที" + ) + self.assertEqual( + thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง" + ) self.assertIsNotNone(thai_time("0:30")) self.assertIsNotNone(thai_time("0:30", "6h")) self.assertIsNotNone(thai_time("0:30", "m6h")) From 01f5fa839aa24d36b0115d8c7f00c99dc8a07cbe Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 10:08:50 +0000 Subject: [PATCH 04/11] more semantic test cases --- tests/test_tokenize.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 8fa153ccd..e67f2a3da 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -224,18 +224,30 @@ def test_sent_tokenize(self): def test_subword_tokenize(self): self.assertEqual(subword_tokenize(None), []) self.assertEqual(subword_tokenize(""), []) + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="tcc")) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc") + ) self.assertEqual(subword_tokenize(None, engine="etcc"), []) self.assertEqual(subword_tokenize("", engine="etcc"), []) self.assertIsNotNone( subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc") ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") + ) self.assertIsNotNone(subword_tokenize("เบียร์สิงห์", engine="etcc")) self.assertEqual(subword_tokenize(None, engine="ssg"), []) self.assertEqual(subword_tokenize("", engine="ssg"), []) - self.assertTrue("ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) + self.assertTrue( + "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) def test_syllable_tokenize(self): self.assertEqual(syllable_tokenize(None), []) @@ -243,12 +255,14 @@ def test_syllable_tokenize(self): self.assertEqual( syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] ) + self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก")) self.assertEqual(syllable_tokenize(None, engine="ssg"), []) self.assertEqual(syllable_tokenize("", engine="ssg"), []) self.assertEqual( syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] ) + self.assertFalse("า" in syllable_tokenize("แมวกินปลา", engine="etcc")) def test_tcc(self): self.assertEqual(tcc.segment(None), []) From 19e743dd2cb9595cc067051cc42819ba7c84c722 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 10:30:17 +0000 Subject: [PATCH 05/11] test if deprecated function raises DeprecationWarning --- tests/test_tokenize.py | 6 +++--- tests/test_util.py | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index e67f2a3da..fa85f7351 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -26,9 +26,6 @@ class TestTokenizePackage(unittest.TestCase): - def test_dict_word_tokenize(self): - self.assertEqual(dict_word_tokenize(""), []) - def test_etcc(self): self.assertEqual(etcc.segment(""), "") self.assertIsInstance(etcc.segment("คืนความสุข"), list) @@ -81,6 +78,9 @@ def test_word_tokenize(self): word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE) ) + with self.assertRaises(DeprecationWarning): + dict_word_tokenize("เลิกใช้แล้ว") + def test_Tokenizer(self): t_test = Tokenizer(DEFAULT_DICT_TRIE) self.assertEqual(t_test.word_tokenize(""), []) diff --git a/tests/test_util.py b/tests/test_util.py index 4d8ebe820..875338833 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -243,7 +243,9 @@ def test_thai_time(self): def test_delete_tone(self): self.assertEqual(delete_tone("จิ้น"), "จิน") self.assertEqual(delete_tone("เก๋า"), "เกา") - self.assertEqual(delete_tone("จิ้น"), deletetone("จิ้น")) + + with self.assertRaises(DeprecationWarning): + deletetone("จิ้น") def test_normalize(self): self.assertEqual(normalize("เเปลก"), "แปลก") @@ -271,7 +273,6 @@ def test_isthai(self): self.assertEqual(isthai("(ต.ค.)", ignore_chars=".()"), True) def test_is_native_thai(self): - self.assertEqual(is_native_thai("เลข"), thaicheck("เลข")) self.assertEqual(is_native_thai(None), False) self.assertEqual(is_native_thai(""), False) self.assertEqual(is_native_thai("116"), False) @@ -291,3 +292,6 @@ def test_is_native_thai(self): self.assertEqual(is_native_thai("เลข"), False) self.assertEqual(is_native_thai("เทเวศน์"), False) self.assertEqual(is_native_thai("เทเวศร์"), False) + + with self.assertRaises(DeprecationWarning): + thaicheck("เลข") From ef80dc3ecd83bed6d3c72fcc453f61a21dea2c55 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 10:33:25 +0000 Subject: [PATCH 06/11] close db --- pythainlp/corpus/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 6199b24b4..9b0e406e0 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -44,8 +44,9 @@ def corpus_db_path() -> str: def get_corpus_db_detail(name: str) -> dict: db = TinyDB(corpus_db_path()) query = Query() - res = db.search(query.name == name) + db.close() + if res: return res[0] else: @@ -281,6 +282,7 @@ def remove(name: str) -> bool: db = TinyDB(corpus_db_path()) query = Query() data = db.search(query.name == name) + db.close() if data: path = get_corpus_path(name) From 574885f60867d47c2d19a8fde55cf596afbf0256 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 12:25:21 +0000 Subject: [PATCH 07/11] catch deprecation warnings --- pythainlp/corpus/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 9b0e406e0..2c08a1a29 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -282,14 +282,15 @@ def remove(name: str) -> bool: db = TinyDB(corpus_db_path()) query = Query() data = db.search(query.name == name) - db.close() if data: path = get_corpus_path(name) os.remove(path) db.remove(query.name == name) + db.close() return True + db.close() return False From 251b189f61cb53a521142a367d83d547970acbe0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 12:25:49 +0000 Subject: [PATCH 08/11] catch deprecation warnings --- tests/test_tokenize.py | 2 +- tests/test_util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index fa85f7351..f7c96bb38 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -78,7 +78,7 @@ def test_word_tokenize(self): word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE) ) - with self.assertRaises(DeprecationWarning): + with self.assertWarns(DeprecationWarning): dict_word_tokenize("เลิกใช้แล้ว") def test_Tokenizer(self): diff --git a/tests/test_util.py b/tests/test_util.py index 875338833..3461cab29 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -244,7 +244,7 @@ def test_delete_tone(self): self.assertEqual(delete_tone("จิ้น"), "จิน") self.assertEqual(delete_tone("เก๋า"), "เกา") - with self.assertRaises(DeprecationWarning): + with self.assertWarns(DeprecationWarning): deletetone("จิ้น") def test_normalize(self): @@ -293,5 +293,5 @@ def test_is_native_thai(self): self.assertEqual(is_native_thai("เทเวศน์"), False) self.assertEqual(is_native_thai("เทเวศร์"), False) - with self.assertRaises(DeprecationWarning): + with self.assertWarns(DeprecationWarning): thaicheck("เลข") From 96b154c7611a6a6fc1ac5ac13b1c66afe5037b89 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Nov 2019 13:27:44 +0000 Subject: [PATCH 09/11] Commented out assertWarns until a bug in unittest get fixed - https://bugs.python.org/issue29620 --- tests/test_tokenize.py | 21 ++++++++++++++------- tests/test_util.py | 14 ++++++++++---- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index f7c96bb38..d47b966b3 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -58,28 +58,35 @@ def test_word_tokenize(self): self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut") ) - self.assertIsNotNone( - word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX") - ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut") ) + self.assertIsNotNone( + word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX") + ) # XX engine is not existed self.assertIsNotNone(dict_trie(())) self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie"))) self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"])) + self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"})) self.assertIsNotNone(dict_trie(thai_words())) self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE)) self.assertIsNotNone( dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME)) ) - self.assertIsNotNone( - word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE) + self.assertTrue( + "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])) ) - with self.assertWarns(DeprecationWarning): - dict_word_tokenize("เลิกใช้แล้ว") + # Commented out until this unittest bug get fixed: + # https://bugs.python.org/issue29620 + # with self.assertWarns(DeprecationWarning): + # dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE) + self.assertEqual( + word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])), + dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])), + ) def test_Tokenizer(self): t_test = Tokenizer(DEFAULT_DICT_TRIE) diff --git a/tests/test_util.py b/tests/test_util.py index 3461cab29..11ed7f171 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -244,8 +244,11 @@ def test_delete_tone(self): self.assertEqual(delete_tone("จิ้น"), "จิน") self.assertEqual(delete_tone("เก๋า"), "เกา") - with self.assertWarns(DeprecationWarning): - deletetone("จิ้น") + # Commented out until this unittest bug get fixed: + # https://bugs.python.org/issue29620 + # with self.assertWarns(DeprecationWarning): + # deletetone("จิ้น") + self.assertEqual(deletetone("จิ้น"), delete_tone("จิ้น")) def test_normalize(self): self.assertEqual(normalize("เเปลก"), "แปลก") @@ -293,5 +296,8 @@ def test_is_native_thai(self): self.assertEqual(is_native_thai("เทเวศน์"), False) self.assertEqual(is_native_thai("เทเวศร์"), False) - with self.assertWarns(DeprecationWarning): - thaicheck("เลข") + # Commented out until this unittest bug get fixed: + # https://bugs.python.org/issue29620 + # with self.assertWarns(DeprecationWarning): + # thaicheck("เลข") + self.assertEqual(thaicheck("เลข"), is_native_thai("เลข")) From 73145c61760165859a69b8a2554ea9aafc752a45 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 16 Nov 2019 03:18:38 +0700 Subject: [PATCH 10/11] PyThaiNLP 2.1.dev8 --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b5175f11b..2d9c74efb 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,13 @@ Some functionalities, like named-entity recognition, required extra packages. See https://github.com/PyThaiNLP/pythainlp for installation options. + + +Made with ❤️ + +PyThaiNLP Team + +"We build Thai NLP" """ requirements = [ @@ -63,7 +70,7 @@ setup( name="pythainlp", - version="2.1.dev7", + version="2.1.dev8", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", From 7b86c27abd64aae69274467772446e7285f204ed Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 16 Nov 2019 17:28:34 +0700 Subject: [PATCH 11/11] Update tox.ini (build and deploy docs) --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index e5573e5b5..2f7962f7b 100644 --- a/tox.ini +++ b/tox.ini @@ -16,7 +16,7 @@ setenv = commands = python setup.py test -; If you want to make tox run the tests with the same versions, create a +; If you want to make tox run the tests with the same versions, create a ; requirements.txt with the pinned versions and uncomment the following lines: ; deps = ; -r{toxinidir}/requirements.txt