From 4a327a8a7365406d10bc1b1e8c2a09c93b18ead3 Mon Sep 17 00:00:00 2001 From: HRNPH <51855316+HRNPH@users.noreply.github.com> Date: Fri, 21 Apr 2023 05:05:00 +0700 Subject: [PATCH 1/9] add option to count dead syllable as aek --- pythainlp/khavee/core.py | 10 ++++++---- pythainlp/khavee/example.py | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 0e3fa238a..8ae660e08 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from typing import List, Union from pythainlp.tokenize import subword_tokenize - +from pythainlp.util import sound_syllable class KhaveeVerifier: def __init__(self): @@ -380,11 +380,11 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: else: return 'Something went wrong Make sure you enter it in correct form.' - def check_aek_too(self, text: Union[List[str], str]) -> Union[List[bool], List[str], bool, str]: + def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool = False) -> Union[List[bool], List[str], bool, str]: """ Thai tonal word checker - :param str or list[str] text: Thai word or list of Thai words + :param str or list[str] text: Thai word or list of Thai words, bool dead_syllable_as_aek: if True, dead syllable will be considered as aek :return: the check if the word is aek or too or False(not both) or list of the check if input is list :rtype: Union[List[bool], List[str], bool, str] @@ -402,7 +402,7 @@ def check_aek_too(self, text: Union[List[str], str]) -> Union[List[bool], List[s ## -> [False, 'aek', 'too'] """ if isinstance(text, list): - return [self.check_aek_too(t) for t in text] + return [self.check_aek_too(t, dead_syllable_as_aek) for t in text] if not isinstance(text, str): raise TypeError('text must be str or iterable list[str]') @@ -412,5 +412,7 @@ def check_aek_too(self, text: Union[List[str], str]) -> Union[List[bool], List[s return 'aek' elif '้' in 
word_characters and not '่' in word_characters: return 'too' + if dead_syllable_as_aek and sound_syllable(text) == 'dead': + return 'aek' else: return False diff --git a/pythainlp/khavee/example.py b/pythainlp/khavee/example.py index deb7d0ab2..dce216edc 100644 --- a/pythainlp/khavee/example.py +++ b/pythainlp/khavee/example.py @@ -59,3 +59,5 @@ # -> False, aek, too print(kv.check_aek_too(['เอง', 'เอ่ง', 'เอ้ง'])) # ใช้ List ได้เหมือนกัน # -> [False, 'aek', 'too'] +print(kv.check_aek_too(['ห๊ะ', 'เอ่ง', 'เอ้ง'], dead_syllable_as_aek=True)) # ใช้ List ได้เหมือนกัน และสามารถตั้งค่า ให้นับคำที่เสียงตายเป็นเอกได้ ตามการเช็คคฉันทลักษณ์กลอน +# -> ['aek', 'aek', 'too'] From 485b12df94cafb1d9fa63b8488306c6fe20ae791 Mon Sep 17 00:00:00 2001 From: HRNPH <51855316+HRNPH@users.noreply.github.com> Date: Fri, 21 Apr 2023 06:12:04 +0700 Subject: [PATCH 2/9] add new karun handler to khavee matra checker --- pythainlp/khavee/core.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 77b38e51d..f89caa2df 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -214,11 +214,7 @@ def check_marttra(self, word: str) -> str: """ if word[-1] == 'ร' and word[-2] in ['ต','ท'] : word = word[:-1] - if '์' in word[-1]: - if 'ิ' in word[-2] or 'ุ' in word[-2]: - word = word[:-3] - else: - word = word[:-2] + word = self.handle_karun_sound_silenced(word) if 'ำ' in word or ('ํ' in word and 'า' in word) or 'ไ' in word or 'ใ' in word: return 'กา' elif word[-1] in ['า','ะ','ิ','ี','ุ','ู','อ'] or ('ี' in word and 'ย' in word[-1]) or ('ื' in word and 'อ' in word[-1]): @@ -451,3 +447,22 @@ def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool = return 'aek' else: return False + + def handle_karun_sound_silence(text: str) -> str: + """ + Handle sound silence in Thai word using '์' character (Karun) + by stripping all the characters before the 'Karun' character that 
should be silenced + + :param str text: Thai word + :return: Thai word with silence word stripped + :rtype: str + """ + sound_silenced = True if word.endswith('์') else False + if not sound_silenced: + return text + thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" + locate_silenced = word.rfind('์') - 1 + can_silence_two = True if word[locate_silenced-2] in thai_consonants else False + cut_off = 2 if can_silence_two else 1 + word = word[:locate_silenced + 1 - cut_off] + return word From 8d9e319382b63546b9a2ee2b38fa125935107c34 Mon Sep 17 00:00:00 2001 From: HRNPH <51855316+HRNPH@users.noreply.github.com> Date: Sat, 22 Apr 2023 14:15:57 +0700 Subject: [PATCH 3/9] fix karun function name & parameters --- pythainlp/khavee/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index f89caa2df..2de5d89d6 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -214,7 +214,7 @@ def check_marttra(self, word: str) -> str: """ if word[-1] == 'ร' and word[-2] in ['ต','ท'] : word = word[:-1] - word = self.handle_karun_sound_silenced(word) + word = self.handle_karun_sound_silence(word) if 'ำ' in word or ('ํ' in word and 'า' in word) or 'ไ' in word or 'ใ' in word: return 'กา' elif word[-1] in ['า','ะ','ิ','ี','ุ','ู','อ'] or ('ี' in word and 'ย' in word[-1]) or ('ื' in word and 'อ' in word[-1]): @@ -448,7 +448,7 @@ def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool = else: return False - def handle_karun_sound_silence(text: str) -> str: + def handle_karun_sound_silence(self, word: str) -> str: """ Handle sound silence in Thai word using '์' character (Karun) by stripping all the characters before the 'Karun' character that should be silenced @@ -459,7 +459,7 @@ def handle_karun_sound_silence(text: str) -> str: """ sound_silenced = True if word.endswith('์') else False if not sound_silenced: - return text + return word thai_consonants = 
"กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" locate_silenced = word.rfind('์') - 1 can_silence_two = True if word[locate_silenced-2] in thai_consonants else False From dd4f0a117cd303f8bb57a268f75369530f402b88 Mon Sep 17 00:00:00 2001 From: HRNPH Date: Sat, 22 Apr 2023 14:26:34 +0700 Subject: [PATCH 4/9] Fixing thai tonal checker type description --- pythainlp/khavee/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 2de5d89d6..dc892b75f 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -412,8 +412,8 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool = False) -> Union[List[bool], List[str], bool, str]: """ Thai tonal word checker - - :param str or list[str] text: Thai word or list of Thai words + :param Union[List[str], str] text: Thai word or list of Thai words + :param bool dead_syllable_as_aek: if True, dead syllable will be considered as aek :return: the check if the word is aek or too or False(not both) or list of the check if input is list :rtype: Union[List[bool], List[str], bool, str] From 4ee2196026c7d32f3dcd19b24f422ad2ef2e0a4a Mon Sep 17 00:00:00 2001 From: HRNPH <51855316+HRNPH@users.noreply.github.com> Date: Thu, 27 Apr 2023 08:23:20 +0700 Subject: [PATCH 5/9] adding tonemark removal to fix mattra checking --- pythainlp/khavee/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 2de5d89d6..91beda20b 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -15,7 +15,7 @@ from typing import List, Union from pythainlp.tokenize import subword_tokenize from pythainlp.util import sound_syllable - +from pythainlp.util import remove_tonemark class KhaveeVerifier: def __init__(self): """ @@ -215,6 +215,7 @@ def check_marttra(self, word: str) -> str: if 
word[-1] == 'ร' and word[-2] in ['ต','ท'] : word = word[:-1] word = self.handle_karun_sound_silence(word) + word = remove_tonemark(word) if 'ำ' in word or ('ํ' in word and 'า' in word) or 'ไ' in word or 'ใ' in word: return 'กา' elif word[-1] in ['า','ะ','ิ','ี','ุ','ู','อ'] or ('ี' in word and 'ย' in word[-1]) or ('ื' in word and 'อ' in word[-1]): From a1161364626c9346adf2c6603cb922b17d71c226 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 3 May 2023 16:01:45 +0700 Subject: [PATCH 6/9] PyThaiNLP v4.0.1 --- pythainlp/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 73aa896c6..c06b7cf03 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -17,7 +17,7 @@ # # URL: # For license information, see LICENSE -__version__ = "4.0.0" +__version__ = "4.0.1" thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars diff --git a/setup.cfg b/setup.cfg index 03a441c08..121adc632 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.0.0 +current_version = 4.0.1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 976c513db..92d7d5390 100644 --- a/setup.py +++ b/setup.py @@ -143,7 +143,7 @@ setup( name="pythainlp", - version="4.0.0", + version="4.0.1", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", From f906d150bff8a1eb9965ff0ca18e37b984663be8 Mon Sep 17 00:00:00 2001 From: TripleKdev <80637250+kangkengkhadev@users.noreply.github.com> Date: Tue, 16 May 2023 02:19:41 +0700 Subject: [PATCH 7/9] fixed bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit in case ำ and า --- pythainlp/khavee/core.py | 6 ++++-- pythainlp/khavee/example.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 1eede7006..1dfa7e6d1 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -184,10 +184,12 @@ def check_sara(self, word: str)-> str: sara.append('ออ') # incase บ่ - if 'บ่' in word: + if 'บ่' == word: sara = [] sara.append('ออ') - + if 'ํ' in word: + sara = [] + sara.append('อำ') if sara == []: return 'Cant find Sara in this word' else: diff --git a/pythainlp/khavee/example.py b/pythainlp/khavee/example.py index ad1726fa2..cd123d7b3 100644 --- a/pythainlp/khavee/example.py +++ b/pythainlp/khavee/example.py @@ -16,12 +16,13 @@ # True # การตรวจสอบคำสำผัสที่ผิด -print('สรร ขวาน',kv.is_sumpus('สรร','ขวาน')) +print('นํ้า กา',kv.is_sumpus('นํ้า','กา')) # False # การตรวจสอบคำ ครุ ลหุ print('สรร',kv.check_karu_lahu('สรร')) #karu + # การตรวจสอบคำ ครุ ลหุ print('ชิชะ',kv.check_karu_lahu('ชิชะ')) # lahu From 231a2337740221009c66526568d0a0105ebccaed Mon Sep 17 00:00:00 2001 From: TripleKdev <80637250+kangkengkhadev@users.noreply.github.com> Date: Sun, 21 May 2023 22:48:08 +0700 Subject: [PATCH 8/9] =?UTF-8?q?fig=20=E0=B9=80=E0=B8=AD=E0=B8=B7=E0=B8=AD?= =?UTF-8?q?=E0=B8=99=20=E0=B8=AD=E0=B8=A7=E0=B8=99?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit --- pythainlp/khavee/core.py | 3 +++ pythainlp/khavee/example.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 1dfa7e6d1..be007718e 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -190,6 +190,9 @@ def check_sara(self, word: str)-> str: if 'ํ' in word: sara = [] sara.append('อำ') + if 'เ' in word and 'ื' in word and 'อ' in word: + sara = [] + sara.append('เอือ') if sara == []: return 'Cant find Sara in this word' else: diff --git a/pythainlp/khavee/example.py b/pythainlp/khavee/example.py index cd123d7b3..b6dfba79c 100644 --- a/pythainlp/khavee/example.py +++ b/pythainlp/khavee/example.py @@ -16,7 +16,7 @@ # True # การตรวจสอบคำสำผัสที่ผิด -print('นํ้า กา',kv.is_sumpus('นํ้า','กา')) +print('เพื่อน ล้วน',kv.is_sumpus('เพื่อน','ล้วน')) # False # การตรวจสอบคำ ครุ ลหุ From 33c5b5a270f8775407f95bdfccce804a32dff024 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 31 May 2023 00:10:43 +0700 Subject: [PATCH 9/9] PyThaiNLP v4.0.2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix เอือน อวน #798 --- pythainlp/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index c06b7cf03..9feccaa53 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -17,7 +17,7 @@ # # URL: # For license information, see LICENSE -__version__ = "4.0.1" +__version__ = "4.0.2" thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars diff --git a/setup.py b/setup.py index 92d7d5390..c03533bf7 100644 --- a/setup.py +++ b/setup.py @@ -143,7 +143,7 @@ setup( name="pythainlp", - version="4.0.1", + version="4.0.2", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown",