diff --git a/docs/api/util.rst b/docs/api/util.rst index 9a1554707..ef38bcd44 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -7,6 +7,11 @@ The :mod:`pythainlp.util` module serves as a treasure trove of utility functions Modules ------- +.. autofunction:: analyze_thai_text + :noindex: + + Analyzes a string of Thai text and returns a dictionary that maps each character's descriptive name (or the character itself when it has no descriptive name) to the number of times it occurs in the text. + .. autofunction:: abbreviation_to_full_text :noindex: diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 7613257a6..d9f832c3f 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -69,6 +69,7 @@ "tone_detector", "tone_to_spelling", "words_to_num", + "analyze_thai_text", ] from pythainlp.util import spell_words @@ -121,6 +122,7 @@ isthai, isthaichar, thai_word_tone_detector, + analyze_thai_text, ) from pythainlp.util.thai_lunar_date import th_zodiac, to_lunar_date from pythainlp.util.thaiwordcheck import is_native_thai diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index aea0bfbb2..a0fa2a5b1 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -8,6 +8,7 @@ import string from typing import Tuple +from collections import defaultdict from pythainlp import ( thai_above_vowels, @@ -26,6 +27,52 @@ _TH_FIRST_CHAR_ASCII = 3584 _TH_LAST_CHAR_ASCII = 3711 +# A comprehensive map of Thai characters to their descriptive names. 
+THAI_CHAR_NAMES = { + # Consonants + **{char: char for char in thai_consonants}, + # Vowels and Signs + "\u0e24": "ฤ", + "\u0e26": "ฦ", + "\u0e30": "สระ อะ", + "\u0e31": "ไม้หันอากาศ", + "\u0e32": "สระ อา", + "\u0e33": "สระ อำ", + "\u0e34": "สระ อิ", + "\u0e35": "สระ อี", + "\u0e36": "สระ อึ", + "\u0e37": "สระ อือ", + "\u0e38": "สระ อุ", + "\u0e39": "สระ อู", + "\u0e40": "สระ เอ", + "\u0e41": "สระ แอ", + "\u0e42": "สระ โอ", + "\u0e43": "สระ ใอ", + "\u0e44": "สระ ไอ", + "\u0e45": "ลากข้างยาว", + "\u0e4d": "นฤคหิต", + "\u0e47": "ไม้ไต่คู้", + # Tone Marks + "\u0e48": "ไม้เอก", + "\u0e49": "ไม้โท", + "\u0e4a": "ไม้ตรี", + "\u0e4b": "ไม้จัตวา", + # Other Signs + "\u0e2f": "ไปยาลน้อย", + "\u0e3a": "พินทุ", + "\u0e46": "ไม้ยมก", + "\u0e4c": "การันต์", + "\u0e4e": "ยามักการ", + # Punctuation + "\u0e4f": "ฟองมัน", + "\u0e5a": "อังคั่นคู่", + "\u0e5b": "โคมุต", + # Digits + **{char: char for char in thai_digits}, + # Symbol + "\u0e3f": "฿", +} + def isthaichar(ch: str) -> bool: """Check if a character is a Thai character. @@ -269,3 +316,38 @@ def count_thai_chars(text: str) -> dict: else: _dict["non_thai"] += 1 return _dict + + +def analyze_thai_text(text: str) -> dict: + """ + Analyzes a string of Thai text and returns a dictionary + that counts the occurrences of each classified character in the text. + + The function processes the text character by character and maps each Thai + character to its descriptive name or itself (for consonants and digits). + + :param str text: The Thai text string to be analyzed. + :rtype: dict + :return: A dictionary mapping each character name (or the character + itself when it has no mapped name) to its number of occurrences. 
+ + Examples: + >>> analyze_thai_text("คนดี") + {'ค': 1, 'น': 1, 'ด': 1, 'สระ อี': 1} + + >>> analyze_thai_text("เล่น") + {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1} + """ + results = defaultdict(int) + + # Iterate over each character in the input string + for char in text: + # Check if the character is in our mapping + if char in THAI_CHAR_NAMES: + name = THAI_CHAR_NAMES[char] + results[name]+=1 + else: + # Characters without a mapped name are counted under the character itself + results[char]+=1 + + return dict(results) diff --git a/tests/core/test_util.py b/tests/core/test_util.py index 9c46ad2aa..7d01fb174 100644 --- a/tests/core/test_util.py +++ b/tests/core/test_util.py @@ -67,6 +67,7 @@ tone_detector, words_to_num, spelling, + analyze_thai_text, ) from pythainlp.util.morse import morse_decode, morse_encode @@ -874,3 +875,13 @@ def test_longest_common_subsequence(self): self.assertEqual(longest_common_subsequence("", "ABC"), "") self.assertEqual(longest_common_subsequence("ABC", ""), "") self.assertEqual(longest_common_subsequence("", ""), "") + + def test_analyze_thai_text(self): + self.assertEqual( + analyze_thai_text("คนดี"), + {"ค": 1, "น": 1, "ด": 1, "สระ อี": 1} + ) + self.assertEqual( + analyze_thai_text("เล่น"), + {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1} + )