Skip to content
Merged
5 changes: 4 additions & 1 deletion pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ def corpus_db_path() -> str:
def get_corpus_db_detail(name: str) -> dict:
db = TinyDB(corpus_db_path())
query = Query()

res = db.search(query.name == name)
db.close()

if res:
return res[0]
else:
Expand Down Expand Up @@ -286,8 +287,10 @@ def remove(name: str) -> bool:
path = get_corpus_path(name)
os.remove(path)
db.remove(query.name == name)
db.close()
return True

db.close()
return False


Expand Down
4 changes: 2 additions & 2 deletions pythainlp/tokenize/attacut.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
"""
from typing import List

import attacut
from attacut import tokenize


def segment(text: str) -> List[str]:
    """Word-tokenize Thai text with the AttaCut engine.

    :param text: input text to be tokenized
    :return: list of word tokens; an empty list if *text* is empty,
        ``None``, or not a ``str``
    """
    # Guard against None / empty string / non-string input instead of
    # letting the underlying tokenizer raise.
    if not text or not isinstance(text, str):
        return []

    # Diff residue removed: keep only the post-merge call to the
    # directly-imported ``tokenize`` (from ``attacut``).
    return tokenize(text)
6 changes: 3 additions & 3 deletions pythainlp/tokenize/deepcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from typing import List, Union

import deepcut
from deepcut import tokenize

from .trie import Trie

Expand All @@ -22,6 +22,6 @@ def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[
if isinstance(custom_dict, Trie):
custom_dict = list(custom_dict)

return deepcut.tokenize(text, custom_dict)
return tokenize(text, custom_dict)

return deepcut.tokenize(text)
return tokenize(text)
4 changes: 2 additions & 2 deletions pythainlp/tokenize/ssg.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
from typing import List

import ssg
from ssg import syllable_tokenize


def segment(text: str) -> List[str]:
    """Syllable-tokenize Thai text with the SSG (CRF) engine.

    :param text: input text to be tokenized
    :return: list of syllable tokens; an empty list if *text* is empty,
        ``None``, or not a ``str``
    """
    # Guard against None / empty string / non-string input instead of
    # letting the underlying tokenizer raise.
    if not text or not isinstance(text, str):
        return []

    # Diff residue removed: keep only the post-merge call to the
    # directly-imported ``syllable_tokenize`` (from ``ssg``).
    return syllable_tokenize(text)
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@

Some functionalities, like named-entity recognition, require extra packages.
See https://github.com/PyThaiNLP/pythainlp for installation options.


Made with ❤️

PyThaiNLP Team

"We build Thai NLP"
"""

requirements = [
Expand Down Expand Up @@ -63,7 +70,7 @@

setup(
name="pythainlp",
version="2.1.dev7",
version="2.1.dev8",
description="Thai Natural Language Processing library",
long_description=readme,
long_description_content_type="text/markdown",
Expand Down
6 changes: 6 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ def test_corpus(self):
self.assertIsNotNone(thai_words())
self.assertIsNotNone(thai_female_names())
self.assertIsNotNone(thai_male_names())
self.assertEqual(get_corpus_db_detail("XXX"), {})
self.assertIsNone(download("test"))
self.assertIsNone(download("test", force=True))
self.assertIsNotNone(get_corpus_db_detail("test"))
self.assertIsNotNone(remove("test"))
self.assertFalse(remove("test"))

def test_tnc(self):
self.assertIsNotNone(tnc.word_freqs())
Expand All @@ -48,6 +50,7 @@ def test_ttc(self):

def test_wordnet(self):
self.assertIsNotNone(wordnet.langs())
self.assertTrue("tha" in wordnet.langs())

self.assertEqual(
wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"]
Expand All @@ -69,6 +72,9 @@ def test_wordnet(self):
self.assertEqual(
wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)
)
self.assertEqual(
wordnet.lch_similarity(bird, mouse), bird.lch_similarity(mouse)
)

cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key()
self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
39 changes: 30 additions & 9 deletions tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@


class TestTokenizePackage(unittest.TestCase):
def test_dict_word_tokenize(self):
self.assertEqual(dict_word_tokenize(""), [])

def test_etcc(self):
self.assertEqual(etcc.segment(""), "")
self.assertIsInstance(etcc.segment("คืนความสุข"), list)
Expand Down Expand Up @@ -61,24 +58,34 @@ def test_word_tokenize(self):
self.assertIsNotNone(
word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut")
)
self.assertIsNotNone(
word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
)
self.assertIsNotNone(
word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut")
)
self.assertIsNotNone(
word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
) # "XX" engine does not exist

self.assertIsNotNone(dict_trie(()))
self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"}))
self.assertIsNotNone(dict_trie(thai_words()))
self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE))
self.assertIsNotNone(
dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
)

self.assertIsNotNone(
word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE)
self.assertTrue(
"ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
)

# Commented out until this unittest bug gets fixed:
# https://bugs.python.org/issue29620
# with self.assertWarns(DeprecationWarning):
# dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE)
self.assertEqual(
word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
)

def test_Tokenizer(self):
Expand Down Expand Up @@ -224,31 +231,45 @@ def test_sent_tokenize(self):
def test_subword_tokenize(self):
self.assertEqual(subword_tokenize(None), [])
self.assertEqual(subword_tokenize(""), [])

self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="tcc"))
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc")
)

self.assertEqual(subword_tokenize(None, engine="etcc"), [])
self.assertEqual(subword_tokenize("", engine="etcc"), [])
self.assertIsNotNone(
subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc")
)
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc")
)
self.assertIsNotNone(subword_tokenize("เบียร์สิงห์", engine="etcc"))

self.assertEqual(subword_tokenize(None, engine="ssg"), [])
self.assertEqual(subword_tokenize("", engine="ssg"), [])
self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="ssg"))
self.assertTrue(
"ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
)
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
)

def test_syllable_tokenize(self):
self.assertEqual(syllable_tokenize(None), [])
self.assertEqual(syllable_tokenize(""), [])
self.assertEqual(
syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"]
)
self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก"))

self.assertEqual(syllable_tokenize(None, engine="ssg"), [])
self.assertEqual(syllable_tokenize("", engine="ssg"), [])
self.assertEqual(
syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
)
self.assertFalse("า" in syllable_tokenize("แมวกินปลา", engine="etcc"))

def test_tcc(self):
self.assertEqual(tcc.segment(None), [])
Expand Down
49 changes: 37 additions & 12 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,10 @@ def test_thai_strftime(self):
# ### pythainlp.util.thai_time

def test_thai_time(self):
self.assertEqual(thai_time("8:17"), thai_time("08:17"))
self.assertEqual(thai_time("8:17"), "แปดนาฬิกาสิบเจ็ดนาที")
self.assertEqual(thai_time("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที")
self.assertEqual(thai_time("8:17", "m6h"), "แปดโมงสิบเจ็ดนาที")
self.assertEqual(thai_time("18:30", "m6h"), "หกโมงครึ่ง")
self.assertEqual(thai_time("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง")
self.assertEqual(
thai_time(datetime.time(12, 3, 0)), "สิบสองนาฬิกาสามนาที"
Expand All @@ -181,23 +181,38 @@ def test_thai_time(self):
"สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
)
self.assertEqual(
thai_time(
datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s"
),
thai_time(datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s"),
"สิบสองนาฬิกาสามนาทีศูนย์วินาที",
)
self.assertEqual(
thai_time(
datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m"
),
thai_time(datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m"),
"สิบสองนาฬิกาสามนาที",
)
self.assertEqual(
thai_time(
datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m"
),
thai_time(datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m"),
"เที่ยงครึ่ง",
)
self.assertEqual(thai_time("18:30"), "สิบแปดนาฬิกาสามสิบนาที")
self.assertEqual(thai_time("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที")
self.assertEqual(
thai_time("18:30:01"), "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที"
)
self.assertEqual(
thai_time("18:30:01", precision="m"), "สิบแปดนาฬิกาสามสิบนาที"
)
self.assertEqual(
thai_time("18:30:01", precision="s"),
"สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที",
)
self.assertEqual(
thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง"
)
self.assertEqual(
thai_time("18:30:01", fmt="m6h"), "หกโมงสามสิบนาทีหนึ่งวินาที"
)
self.assertEqual(
thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง"
)
self.assertIsNotNone(thai_time("0:30"))
self.assertIsNotNone(thai_time("0:30", "6h"))
self.assertIsNotNone(thai_time("0:30", "m6h"))
Expand Down Expand Up @@ -228,7 +243,12 @@ def test_thai_time(self):
def test_delete_tone(self):
self.assertEqual(delete_tone("จิ้น"), "จิน")
self.assertEqual(delete_tone("เก๋า"), "เกา")
self.assertEqual(delete_tone("จิ้น"), deletetone("จิ้น"))

# Commented out until this unittest bug get fixed:
# https://bugs.python.org/issue29620
# with self.assertWarns(DeprecationWarning):
# deletetone("จิ้น")
self.assertEqual(deletetone("จิ้น"), delete_tone("จิ้น"))

def test_normalize(self):
self.assertEqual(normalize("เเปลก"), "แปลก")
Expand Down Expand Up @@ -256,7 +276,6 @@ def test_isthai(self):
self.assertEqual(isthai("(ต.ค.)", ignore_chars=".()"), True)

def test_is_native_thai(self):
self.assertEqual(is_native_thai("เลข"), thaicheck("เลข"))
self.assertEqual(is_native_thai(None), False)
self.assertEqual(is_native_thai(""), False)
self.assertEqual(is_native_thai("116"), False)
Expand All @@ -276,3 +295,9 @@ def test_is_native_thai(self):
self.assertEqual(is_native_thai("เลข"), False)
self.assertEqual(is_native_thai("เทเวศน์"), False)
self.assertEqual(is_native_thai("เทเวศร์"), False)

# Commented out until this unittest bug gets fixed:
# https://bugs.python.org/issue29620
# with self.assertWarns(DeprecationWarning):
# thaicheck("เลข")
self.assertEqual(thaicheck("เลข"), is_native_thai("เลข"))
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ setenv =

commands = python setup.py test

; If you want to make tox run the tests with the same versions, create a
; If you want to make tox run the tests with the same versions, create a
; requirements.txt with the pinned versions and uncomment the following lines:
; deps =
; -r{toxinidir}/requirements.txt