Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ The :mod:`pythainlp.util` module serves as a treasure trove of utility functions
Modules
-------

.. autofunction:: analyze_thai_text
:noindex:

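Analyzes a string of Thai text and returns a dictionary that maps each character's descriptive name (or the character itself, for consonants, digits, and characters without a descriptive name) to the number of times it occurs in the text.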

.. autofunction:: abbreviation_to_full_text
:noindex:

Expand Down
2 changes: 2 additions & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"tone_detector",
"tone_to_spelling",
"words_to_num",
"analyze_thai_text",
]

from pythainlp.util import spell_words
Expand Down Expand Up @@ -121,6 +122,7 @@
isthai,
isthaichar,
thai_word_tone_detector,
analyze_thai_text,
)
from pythainlp.util.thai_lunar_date import th_zodiac, to_lunar_date
from pythainlp.util.thaiwordcheck import is_native_thai
Expand Down
82 changes: 82 additions & 0 deletions pythainlp/util/thai.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import string
from typing import Tuple
from collections import defaultdict

from pythainlp import (
thai_above_vowels,
Expand All @@ -26,6 +27,52 @@
# Decimal code points 3584 (U+0E00) and 3711 (U+0E7F). Despite "ASCII" in the
# names, these are Unicode code point values, not ASCII values.
# NOTE(review): presumably the lower/upper bounds of the Thai Unicode block
# used by the character checks in this module -- confirm against their users.
_TH_FIRST_CHAR_ASCII = 3584
_TH_LAST_CHAR_ASCII = 3711

# A comprehensive map of Thai characters to their descriptive names.
THAI_CHAR_NAMES = {
    # Consonants: each consonant is named by the character itself.
    **{char: char for char in thai_consonants},
    # Vowels and signs
    "\u0e24": "ฤ",
    "\u0e26": "ฦ",
    "\u0e30": "สระ อะ",
    "\u0e31": "ไม้หันอากาศ",
    "\u0e32": "สระ อา",
    "\u0e33": "สระ อำ",
    "\u0e34": "สระ อิ",
    "\u0e35": "สระ อี",
    "\u0e36": "สระ อึ",
    "\u0e37": "สระ อือ",
    "\u0e38": "สระ อุ",
    "\u0e39": "สระ อู",
    "\u0e40": "สระ เอ",
    "\u0e41": "สระ แอ",
    "\u0e42": "สระ โอ",
    "\u0e43": "สระ ใอ",
    "\u0e44": "สระ ไอ",
    # U+0E45 is LAKKHANGYAO (ๅ), not mai muan; mai muan is U+0E43 above.
    "\u0e45": "ลากข้างยาว",
    "\u0e4d": "นฤคหิต",
    "\u0e47": "ไม้ไต่คู้",
    # Tone marks
    "\u0e48": "ไม้เอก",
    "\u0e49": "ไม้โท",
    "\u0e4a": "ไม้ตรี",
    "\u0e4b": "ไม้จัตวา",
    # Other signs
    "\u0e2f": "ไปยาลน้อย",
    "\u0e3a": "พินทุ",
    "\u0e46": "ไม้ยมก",
    "\u0e4c": "การันต์",
    "\u0e4e": "ยามักการ",
    # Punctuation
    "\u0e4f": "ฟองมัน",
    "\u0e5a": "อังคั่นคู่",
    # U+0E5B KHOMUT (๛); standard dictionary spelling of the name.
    "\u0e5b": "โคมูตร",
    # Digits: each digit is named by the character itself.
    **{char: char for char in thai_digits},
    # Currency symbol
    "\u0e3f": "฿",
}


def isthaichar(ch: str) -> bool:
"""Check if a character is a Thai character.
Expand Down Expand Up @@ -269,3 +316,38 @@ def count_thai_chars(text: str) -> dict:
else:
_dict["non_thai"] += 1
return _dict


def analyze_thai_text(text: str) -> dict:
    """
    Count the characters of a text, keyed by their descriptive names.

    The text is processed character by character. A character found in
    ``THAI_CHAR_NAMES`` is counted under the name stored there (consonants
    and digits are stored under the character itself); any other character
    is counted under the character itself.

    :param str text: The text to be analyzed.
    :rtype: dict
    :return: A dictionary mapping each name (or raw character) to the
        number of times it occurs in ``text``, in order of first occurrence.

    Examples:
    >>> analyze_thai_text("คนดี")
    {'ค': 1, 'น': 1, 'ด': 1, 'สระ อี': 1}

    >>> analyze_thai_text("เล่น")
    {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1}
    """
    tally = defaultdict(int)

    for symbol in text:
        # Fall back to the character itself when it has no mapped name.
        label = THAI_CHAR_NAMES.get(symbol, symbol)
        tally[label] += 1

    # Hand back a plain dict rather than the defaultdict used for counting.
    return dict(tally)
11 changes: 11 additions & 0 deletions tests/core/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
tone_detector,
words_to_num,
spelling,
analyze_thai_text,
)
from pythainlp.util.morse import morse_decode, morse_encode

Expand Down Expand Up @@ -874,3 +875,13 @@ def test_longest_common_subsequence(self):
self.assertEqual(longest_common_subsequence("", "ABC"), "")
self.assertEqual(longest_common_subsequence("ABC", ""), "")
self.assertEqual(longest_common_subsequence("", ""), "")

def test_analyze_thai_text(self):
    # Each case pairs an input string with the expected mapping of
    # character (or descriptive name) to its occurrence count.
    expectations = (
        ("คนดี", {"ค": 1, "น": 1, "ด": 1, "สระ อี": 1}),
        ("เล่น", {"สระ เอ": 1, "ล": 1, "ไม้เอก": 1, "น": 1}),
    )
    for sample, expected in expectations:
        self.assertEqual(analyze_thai_text(sample), expected)
Loading