Merge branch 'dev' into delete-lst20-model

wannaphong committed Oct 13, 2022
2 parents e419f20 + 2606f85 commit 7896113
Showing 3 changed files with 293 additions and 84 deletions.
87 changes: 87 additions & 0 deletions pythainlp/tokenize/_utils.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
Utility functions for tokenize module.
"""

import re
from typing import List, Callable

_DIGITS_WITH_SEPARATOR = re.compile(r"(\d+[\.\,:])+\d+")


def apply_postprocessors(
segments: List[str], postprocessors: List[Callable[[List[str]], List[str]]]
) -> List[str]:
"""
Apply each callable in ``postprocessors``, in order, to a raw segmentation result.
"""
for func in postprocessors:
segments = func(segments)

return segments


def rejoin_formatted_num(segments: List[str]) -> List[str]:
"""
Rejoin well-known formatted numeric strings that have been over-tokenized.
A formatted numeric string is a run of digits separated by ":", ",", or ".",
such as a time, a decimal number, a comma-separated number, or an IP address.
:param List[str] segments: result from word tokenizer
:return: a list of fixed tokens
:rtype: List[str]
:Example:
tokens = ['ขณะ', 'นี้', 'เวลา', ' ', '12', ':', '00น', ' ', 'อัตรา',
'แลกเปลี่ยน', ' ', '1', ',', '234', '.', '5', ' ', 'baht/zeny']
rejoin_formatted_num(tokens)
# output:
# ['ขณะ', 'นี้', 'เวลา', ' ', '12:00น', ' ', 'อัตรา', 'แลกเปลี่ยน', ' ', '1,234.5', ' ', 'baht/zeny']
tokens = ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127', '.', '0', '.', '0', '.', '1', ' ', 'ครับ']
rejoin_formatted_num(tokens)
# output:
# ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127.0.0.1', ' ', 'ครับ']
"""
original = "".join(segments)
matching_results = _DIGITS_WITH_SEPARATOR.finditer(original)
tokens_joined = []
pos = 0
segment_idx = 0

match = next(matching_results, None)
while segment_idx < len(segments) and match:
is_span_beginning = pos >= match.start()
token = segments[segment_idx]
if is_span_beginning:
connected_token = ""
while pos < match.end() and segment_idx < len(segments):
connected_token += segments[segment_idx]
pos += len(segments[segment_idx])
segment_idx += 1

tokens_joined.append(connected_token)
match = next(matching_results, None)
else:
tokens_joined.append(token)
segment_idx += 1
pos += len(token)
tokens_joined += segments[segment_idx:]
return tokens_joined


def strip_whitespace(segments: List[str]) -> List[str]:
"""
Strip whitespace(s) off each token and remove whitespace tokens.
:param List[str] segments: result from word tokenizer
:return: a list of tokens
:rtype: List[str]
:Example:
tokens = [" ", "วันนี้ ", "เวลา ", "19.00น"]
strip_whitespace(tokens)
# ["วันนี้", "เวลา", "19.00น"]
"""
segments = [token.strip(" ") for token in segments if token.strip(" ")]
return segments
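
Taken together, these helpers form a small post-processing pipeline: apply_postprocessors simply threads a token list through each callable in order. A minimal sketch, reusing the token list from the docstring above (it assumes this branch of pythainlp is installed so the helpers are importable from pythainlp.tokenize._utils):

from pythainlp.tokenize._utils import (
    apply_postprocessors,
    rejoin_formatted_num,
    strip_whitespace,
)

tokens = ['ขณะ', 'นี้', 'เวลา', ' ', '12', ':', '00น', ' ', 'อัตรา',
          'แลกเปลี่ยน', ' ', '1', ',', '234', '.', '5', ' ', 'baht/zeny']

# first rejoin over-tokenized numerics, then drop whitespace-only tokens
cleaned = apply_postprocessors(tokens, [rejoin_formatted_num, strip_whitespace])
# expected, per the docstrings above:
# ['ขณะ', 'นี้', 'เวลา', '12:00น', 'อัตรา', 'แลกเปลี่ยน', '1,234.5', 'baht/zeny']
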
111 changes: 77 additions & 34 deletions pythainlp/tokenize/core.py
@@ -13,6 +13,11 @@
DEFAULT_WORD_DICT_TRIE,
DEFAULT_WORD_TOKENIZE_ENGINE,
)
from pythainlp.tokenize._utils import (
apply_postprocessors,
rejoin_formatted_num,
strip_whitespace,
)
from pythainlp.util.trie import Trie, dict_trie


@@ -31,6 +36,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
if isinstance(segments[0], str):
segments = [segments]
from pythainlp import thai_characters

for i, s in enumerate(segments):
_list_sents = []
_add_index = []
@@ -39,7 +45,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
for j, w in enumerate(s):
if j > 0:
# previous word
p_w = s[j-1]
p_w = s[j - 1]
# if w is a number or in another language, and is not a space
if (
w[0] not in thai_characters
@@ -57,9 +63,9 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
if not p_w.isspace():
_list_sents.append(" ")
_mark_index.append(j)
elif w.isspace() and j-1 not in _space_index:
elif w.isspace() and j - 1 not in _space_index:
_space_index.append(j)
elif j-1 in _mark_index:
elif j - 1 in _mark_index:
_list_sents.append(" ")
_list_sents.append(w)
_list_all.append(_list_sents)
@@ -72,14 +78,15 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
for j in i:
_temp += j
_text.append(_temp)
return ' '.join(_text)
return " ".join(_text)
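
Given the spacing rules above, a minimal usage sketch (the token list is illustrative, and the expected string assumes the default text output of this branch):

from pythainlp.tokenize import word_detokenize

tokens = ["ผม", "เลี้ยง", "5", "ตัว"]  # illustrative token list
text = word_detokenize(tokens)
# a space is expected around the non-Thai token "5", giving roughly "ผมเลี้ยง 5 ตัว"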


def word_tokenize(
text: str,
custom_dict: Trie = None,
engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
keep_whitespace: bool = True,
join_broken_num: bool = True,
) -> List[str]:
"""
Word tokenizer.
@@ -92,37 +99,47 @@ def word_tokenize(
:param bool keep_whitespace: True to keep whitespaces, a common mark
for end of phrase in Thai.
Otherwise, whitespaces are omitted.
:param bool join_broken_num: True to rejoin formatted numeric tokens
(e.g. time, decimal number, IP address) that may have been wrongly split apart.
Otherwise, such tokens are left separated.
:return: list of words
:rtype: List[str]
**Options for engine**
* *newmm* (default) - dictionary-based, Maximum Matching +
Thai Character Cluster
* *newmm-safe* - newmm, with a mechanism to help avoid long
processing time for text with continuous ambiguous breaking points
* *mm* or *multi_cut* - dictionary-based, Maximum Matching.
* *nlpo3* - Python binding for nlpO3. It is newmm engine in Rust.
* *longest* - dictionary-based, Longest Matching
* *icu* - wrapper for ICU (International Components for Unicode,
using PyICU), dictionary-based
* *attacut* - wrapper for
`AttaCut <https://github.com/PyThaiNLP/attacut>`_.,
learning-based approach
* *deepcut* - wrapper for
`DeepCut <https://github.com/rkcosmos/deepcut>`_,
learning-based approach
* *nercut* - Dictionary-based maximal matching word segmentation,
* *icu* - wrapper for a word tokenizer in
`PyICU <https://gitlab.pyicu.org/main/pyicu>`_.,
from ICU (International Components for Unicode),
dictionary-based
* *longest* - dictionary-based, longest matching
* *mm* - "multi-cut", dictionary-based, maximum matching
* *nercut* - dictionary-based, maximal matching,
constrained with Thai Character Cluster (TCC) boundaries,
and combining tokens that are parts of the same named-entity.
combining tokens that are parts of the same named-entity
* *newmm* (default) - "new multi-cut",
dictionary-based, maximum matching,
constrained with Thai Character Cluster (TCC) boundaries
* *newmm-safe* - newmm, with a mechanism to avoid long
processing time for text with continuous ambiguous breaking points
* *nlpo3* - wrapper for a word tokenizer in
`nlpO3 <https://github.com/PyThaiNLP/nlpo3>`_.,
newmm adaptation in Rust (2.5x faster)
* *oskut* - wrapper for
`OSKut <https://github.com/mrpeerat/OSKut>`_.,
Out-of-domain StacKed cut for Word Segmentation
* *sefr_cut* - wrapper for
`SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.,
Stacked Ensemble Filter and Refine for Word Segmentation
* *tltk* - wrapper for
`TLTK <https://pypi.org/project/tltk/>`_.,
* *oskut* - wrapper for
`OSKut <https://github.com/mrpeerat/OSKut>`_.,
maximum collocation approach
:Note:
- The parameter **custom_dict** can be provided as an argument \
only for *newmm*, *longest*, and *deepcut* engine.
- The **custom_dict** parameter only works for \
*deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
:Example:
Tokenize text with different tokenizer::
@@ -147,6 +164,19 @@ def word_tokenize(
word_tokenize(text, engine="newmm", keep_whitespace=False)
# output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
Join broken formatted numeric (e.g. time, decimals, IP address)::
text = "เงิน1,234บาท19:32น 127.0.0.1"
word_tokenize(text, engine="attacut", join_broken_num=False)
# output:
# ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ',
# '127', '.', '0', '.', '0', '.', '1']
word_tokenize(text, engine="attacut", join_broken_num=True)
# output:
# ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1']
Tokenize with default and custom dictionary::
@@ -168,8 +198,8 @@ def word_tokenize(
word_tokenize(text, engine="newmm", custom_dict=trie)
# output:
# ['ชินโซ', ' ', 'อาเบะ',
# ' ', 'เกิด', ' ', '21', ' ', 'กันยายน']
# ['ชินโซ', ' ', 'อาเบะ', ' ',
# 'เกิด', ' ', '21', ' ', 'กันยายน']
"""
if not text or not isinstance(text, str):
return []
@@ -226,6 +256,7 @@ def word_tokenize(
segments = segment(text)
elif engine == "nlpo3":
from pythainlp.tokenize.nlpo3 import segment

if isinstance(custom_dict, str):
segments = segment(text, custom_dict=custom_dict)
elif not isinstance(custom_dict, str) and custom_dict is not None:
@@ -243,8 +274,14 @@ def word_tokenize(
It might be a typo; if not, please consult our document."""
)

postprocessors = []
if join_broken_num:
postprocessors.append(rejoin_formatted_num)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
postprocessors.append(strip_whitespace)

segments = apply_postprocessors(segments, postprocessors)

return segments
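
The wiring above maps the two keyword flags onto the helpers from pythainlp.tokenize._utils, applied in that order. A minimal sketch of the implied equivalence (it reuses the text from the docstring example and assumes the default newmm engine, which is deterministic for a fixed dictionary):

from pythainlp.tokenize import word_tokenize
from pythainlp.tokenize._utils import rejoin_formatted_num, strip_whitespace

text = "เงิน1,234บาท19:32น 127.0.0.1"

# raw engine output, with no post-processing applied
raw = word_tokenize(text, join_broken_num=False, keep_whitespace=True)

# applying the helpers by hand should match turning the flags on
manual = strip_whitespace(rejoin_formatted_num(raw))
auto = word_tokenize(text, join_broken_num=True, keep_whitespace=False)
assert manual == auto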

@@ -266,12 +303,12 @@ def sent_tokenize(
:rtype: list[str]
**Options for engine**
* *crfcut* - (default) split by CRF trained on TED dataset
* *thaisum* - The implementation of sentence segmentator from \
Nakhun Chumpolsathien, 2020
* *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
* *whitespace+newline* - split by whitespaces and newline.
* *whitespace* - split by whitespaces. Specifically, with \
:class:`regex` pattern ``r" +"``
* *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.,
* *thaisum* - The implementation of sentence segmentator from \
Nakhun Chumpolsathien, 2020
:Example:
Split the text based on *whitespace*::
Expand Down Expand Up @@ -333,7 +370,10 @@ def sent_tokenize(

segments = segment(text)
elif engine == "thaisum":
from pythainlp.tokenize.thaisumcut import ThaiSentenceSegmentor as segmentor
from pythainlp.tokenize.thaisumcut import (
ThaiSentenceSegmentor as segmentor,
)

segment = segmentor()
segments = segment.split_into_sentences(text)
else:
@@ -343,7 +383,7 @@ def sent_tokenize(
)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
segments = strip_whitespace(segments)

return segments
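
For orientation, a hedged sketch of choosing between the engines listed above (the input sentence is illustrative; the actual splits depend on the engine and model version):

from pythainlp.tokenize import sent_tokenize

text = "ฉันไปโรงเรียน เมื่อวานนี้ฝนตกหนักมาก"  # illustrative input

sents_crf = sent_tokenize(text, engine="crfcut")               # default, CRF trained on TED dataset
sents_ws = sent_tokenize(text, engine="whitespace")            # rule-based: split on runs of spaces
sents_wsnl = sent_tokenize(text, engine="whitespace+newline")  # also split on newlines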

@@ -374,13 +414,12 @@ def subword_tokenize(
:return: list of subwords
:rtype: list[str]
**Options for engine**
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *wangchanberta* - SentencePiece from wangchanberta model.
* *dict* - newmm word tokenizer with a syllable dictionary
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *ssg* - CRF syllable segmenter for Thai
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *tltk* - syllable tokenizer from tltk
* *wangchanberta* - SentencePiece from wangchanberta model
:Example:
Tokenize text into subword based on *tcc*::
@@ -454,7 +493,7 @@ def subword_tokenize(
segments = segment(text)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
segments = strip_whitespace(segments)

return segments
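
Similarly, a short sketch of picking a subword engine from the list above (no outputs are asserted, since the clusters depend on the engine and its rules):

from pythainlp.tokenize import subword_tokenize

text = "ประเทศไทย"  # illustrative input

clusters = subword_tokenize(text, engine="tcc")    # default: Thai Character Cluster
syllables = subword_tokenize(text, engine="dict")  # newmm over a syllable dictionary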

Expand Down Expand Up @@ -531,6 +570,7 @@ def __init__(
custom_dict: Union[Trie, Iterable[str], str] = None,
engine: str = "newmm",
keep_whitespace: bool = True,
join_broken_num: bool = True,
):
"""
Initialize tokenizer object.
@@ -553,9 +593,11 @@ def __init__(
raise NotImplementedError(
"""
The Tokenizer class does not support %s for a custom tokenizer
""" % self.__engine
"""
% self.__engine
)
self.__keep_whitespace = keep_whitespace
self.__join_broken_num = join_broken_num

def word_tokenize(self, text: str) -> List[str]:
"""
@@ -570,6 +612,7 @@ def word_tokenize(self, text: str) -> List[str]:
custom_dict=self.__trie_dict,
engine=self.__engine,
keep_whitespace=self.__keep_whitespace,
join_broken_num=self.__join_broken_num,
)

def set_tokenize_engine(self, engine: str) -> None:
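
A hedged usage sketch of the new constructor argument on the Tokenizer class (the input text is illustrative; the joined tokens assume the default newmm dictionary and the post-processing shown above):

from pythainlp.tokenize import Tokenizer

tokenizer = Tokenizer(engine="newmm", keep_whitespace=False, join_broken_num=True)
tokens = tokenizer.word_tokenize("เวลา 12:00น ราคา 1,234.5 บาท")
# formatted numerics such as "12:00น" and "1,234.5" are expected to stay in one token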