pythainlp/tokenize/longest.py (10 changes: 8 additions & 2 deletions)
@@ -142,8 +142,14 @@ def tokenize(self, text: str) -> List[str]:
         return tokens
 
 
-def segment(text: str, custom_dict: Trie = None) -> List[str]:
-    """ตัดคำภาษาไทยด้วยวิธี longest matching"""
+def segment(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE) -> List[str]:
+    """
+    Dictionary-based longest matching word segmentation.
+
+    :param str text: text to be tokenized to words
+    :param pythainlp.trie.Trie custom_dict: dictionary for tokenization
+    :return: list of words, tokenized from the text
+    """
     if not text or not isinstance(text, str):
         return []
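For reference, the removed one-line Thai docstring read roughly "tokenize Thai text with the longest-matching method". Below is a minimal usage sketch, not part of this diff, assuming pythainlp with this change applied; the sample string and outputs are illustrative only.

```python
# Minimal sketch (assumes pythainlp with this PR applied).
from pythainlp.tokenize.longest import segment

# custom_dict can now be omitted; DEFAULT_DICT_TRIE is used by default.
print(segment("ตัดคำภาษาไทย"))  # e.g. ['ตัด', 'คำ', 'ภาษาไทย']

# The input guard returns [] for empty or non-string input.
print(segment(""))    # []
print(segment(None))  # []
```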
pythainlp/tokenize/multi_cut.py (14 changes: 10 additions & 4 deletions)
@@ -47,7 +47,7 @@ def __init__(self, value, multi=None, in_dict=True):
 _PAT_ENG = re.compile(_RE_ENG)
 
 
-def _multicut(text: str, custom_dict: Trie = None):
+def _multicut(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE):
     """
     ส่งคืน LatticeString คืนมาเป็นก้อนๆ
     """
@@ -122,17 +122,23 @@ def _combine(ww: str):
                     yield m.replace("/", "|") + "|" + tail
 
 
-def segment(text: str, custom_dict: Trie = None) -> List[str]:
+def segment(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE) -> List[str]:
     """
-    ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด
+    Dictionary-based maximum matching word segmentation.
+
+    :param str text: text to be tokenized to words
+    :param pythainlp.trie.Trie custom_dict: dictionary for tokenization
+    :return: list of words, tokenized from the text
     """
     if not text or not isinstance(text, str):
         return []
 
     return list(_multicut(text, custom_dict=custom_dict))
 
 
-def find_all_segment(text: str, custom_dict: Trie = None) -> List[str]:
+def find_all_segment(
+    text: str, custom_dict: Trie = DEFAULT_DICT_TRIE
+) -> List[str]:
     """
     Get all possible segment variations
 
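Glossing the Thai here: the unchanged docstring on _multicut says roughly "yields LatticeString results chunk by chunk", and the removed line on segment said "used to find the list of all possible segmentations". A hedged sketch of the two public entry points follows; the sample text is hypothetical, and the '|'-delimited output format is inferred from the _combine line visible above rather than stated in this diff.

```python
# Minimal sketch (assumes the multi_cut module as modified in this PR).
from pythainlp.tokenize.multi_cut import segment, find_all_segment

text = "ผมรักคุณ"  # hypothetical sample input

# Maximum-matching tokenization using the default dictionary trie.
print(segment(text))

# All possible segmentations; judging from _combine above, each variant
# appears to be a single '|'-delimited string.
print(find_all_segment(text))

# Empty or non-string input short-circuits to [].
print(segment(None))  # []
```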
pythainlp/tokenize/ssg.py (3 changes: 3 additions & 0 deletions)
@@ -5,4 +5,7 @@
 
 
 def segment(text: str) -> List[str]:
+    if not text or not isinstance(text, str):
+        return []
+
     return ssg.syllable_tokenize(text)
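A usage sketch for the ssg wrapper as well, assuming the external ssg package is installed; the sample string and syllable split are illustrative.

```python
# Minimal sketch (assumes the external ssg package is installed).
from pythainlp.tokenize.ssg import segment

print(segment("สวัสดีครับ"))  # syllable list, e.g. ['สวัส', 'ดี', 'ครับ']

# The new guard returns [] instead of passing bad input through to ssg.
print(segment(None))  # []
```

The guard here mirrors the checks in longest.py and multi_cut.py above, so all three modules validate input the same way.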