62 changes: 43 additions & 19 deletions pythainlp/tokenize/__init__.py
@@ -47,7 +47,7 @@ def word_tokenize(
from .newmm import segment as segment_

def segment(text):
return segment_(text, trie=FROZEN_DICT_TRIE)
return segment_(text, custom_dict=FROZEN_DICT_TRIE)

elif engine == "icu":
from .pyicu import segment
@@ -58,20 +58,26 @@ def segment(text):
else: # default, use "newmm" engine
from .newmm import segment

if not whitespaces:
return [token.strip(" ") for token in segment(text) if token.strip(" ")]
segments = segment(text)

return segment(text)
if whitespaces:
return segments

return [token.strip(" ") for token in segments if token.strip(" ")]
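
For reference, a minimal usage sketch of the whitespace handling above, assuming the package-level import pythainlp.tokenize.word_tokenize and the default "newmm" engine; the exact tokens depend on the bundled dictionary.

from pythainlp.tokenize import word_tokenize

text = "ผมรักภาษาไทย มาก"  # contains a space

# whitespaces=True (default): whitespace tokens are kept in the output
print(word_tokenize(text, engine="newmm"))

# whitespaces=False: space-only tokens are stripped from the result
print(word_tokenize(text, engine="newmm", whitespaces=False))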


def dict_word_tokenize(
text: str, custom_dict: Trie, engine: str = "newmm"
text: str,
custom_dict: Union[Trie, Iterable[str], str] = DEFAULT_DICT_TRIE,
engine: str = "newmm",
whitespaces: bool = True,
) -> List[str]:
"""
:meth:`dict_word_tokenize` tokenizes words based on the dictionary you provide: a trie, an iterable of words, or a path to a dictionary file.
:param str text: text to be tokenized
:param dict custom_dict: a dictionary trie
:param str engine: choose between different options of engine to token (newmm, mm, longest and deepcut)
:param dict custom_dict: a dictionary trie, an iterable of words, or a path to a dictionary file
:param str engine: tokenization engine (newmm [default], mm, longest, or deepcut)
:param bool whitespaces: True (default) to keep whitespace tokens in the output; whitespace is a common mark of the end of a phrase in Thai
:return: list of words
**Example**::
>>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
@@ -86,16 +92,32 @@ def dict_word_tokenize(

if engine == "newmm" or engine == "onecut":
from .newmm import segment

custom_dict = dict_trie(custom_dict)
elif engine == "longest" or engine == "longest-matching":
from .longest import segment

custom_dict = dict_trie(custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment

custom_dict = dict_trie(custom_dict)
elif engine == "deepcut":
from .deepcut import segment
return segment(text,list(custom_dict))

if not isinstance(custom_dict, (list, str)):
custom_dict = list(custom_dict)
else: # default, use "newmm" engine
from .newmm import segment
return segment(text, custom_dict)

custom_dict = dict_trie(custom_dict)

segments = segment(text, custom_dict)

if whitespaces:
return segments

return [token.strip(" ") for token in segments if token.strip(" ")]
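
For reference, a hedged sketch of the widened custom_dict parameter above: it should accept a pre-built trie, a plain iterable of words, or a path to a word-list file. The word list and the file path below are illustrative placeholders, not part of this changeset.

from pythainlp.tokenize import dict_trie, dict_word_tokenize

words = ["แมว", "กิน", "ปลา"]  # a small custom vocabulary

# 1) an iterable of words is converted to a trie internally
print(dict_word_tokenize("แมวกินปลา", custom_dict=words))

# 2) a pre-built trie can be reused across calls
trie = dict_trie(words)
print(dict_word_tokenize("แมวกินปลา", custom_dict=trie, engine="newmm"))

# 3) a path to a newline-delimited word list (hypothetical path)
# print(dict_word_tokenize("แมวกินปลา", custom_dict="/path/to/wordlist.txt"))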


def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:
@@ -131,15 +153,12 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]:
:return: a list of tokenized strings.
"""
if not text:
return ""
return []

if engine == "etcc":
from .etcc import segment

return segment(text)

# default is "tcc"
from .tcc import segment
else: # default
from .tcc import segment

return segment(text)
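
For reference, a small sketch of the subword behaviour above; note that empty input now yields an empty list rather than an empty string. "tcc" is the default engine, "etcc" the alternative.

from pythainlp.tokenize import subword_tokenize

print(subword_tokenize("ประเทศไทย"))                 # TCC clusters (default engine)
print(subword_tokenize("ประเทศไทย", engine="etcc"))  # enhanced TCC clusters
print(subword_tokenize(""))                          # [] rather than an empty string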

@@ -164,7 +183,7 @@ def syllable_tokenize(text: str) -> List[str]:
return tokens


def dict_trie(dict_source: Union[str, Iterable]) -> Trie:
def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
"""
Create a dict trie which will be used for word_tokenize() function.
For more information on the trie data structure,
@@ -173,20 +192,25 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie:
:param string/list dict_source: a list of words, a path to a dictionary file, or an existing Trie
:return: a trie created from a dictionary input
"""
trie = None

if type(dict_source) is str:
# Receive a file path of the dict to read
with open(dict_source, "r", encoding="utf8") as f:
_vocabs = f.read().splitlines()
return Trie(_vocabs)
trie = Trie(_vocabs)
elif isinstance(dict_source, Iterable):
# Received an iterable of words
return Trie(dict_source)
trie = Trie(dict_source)
elif isinstance(dict_source, Trie):
trie = dict_source
else:
raise TypeError(
"Type of dict_source must be either str (path to source file) or iterable"
"Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)"
)

return trie
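
For reference, a brief sketch of the three dict_source types accepted above; the file path is a hypothetical placeholder.

from pythainlp.tokenize import dict_trie

# From an iterable of words
trie = dict_trie(["ไก่", "ไข่", "ขวด"])

# From an existing marisa_trie.Trie
trie_again = dict_trie(trie)

# From a path to a UTF-8 file with one word per line (hypothetical path)
# trie_from_file = dict_trie("/path/to/words.txt")

# Any other type raises TypeError
try:
    dict_trie(42)
except TypeError as err:
    print(err)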


class Tokenizer:
def __init__(
17 changes: 13 additions & 4 deletions pythainlp/tokenize/deepcut.py
@@ -3,12 +3,21 @@
Wrapper for deepcut Thai word segmentation
"""

from typing import List
from typing import List, Union

import deepcut

from marisa_trie import Trie


def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:
if not text:
return []

if custom_dict:
if isinstance(custom_dict, Trie):
custom_dict = list(custom_dict)

return deepcut.tokenize(text, custom_dict)

def segment(text: str,dict_source:List[str]=None) -> List[str]:
if dict_source!=None:
return deepcut.tokenize(text, custom_dict=dict_source)
return deepcut.tokenize(text)
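
For reference, a hedged sketch of the rewritten wrapper above; it assumes the deepcut package is installed and uses the module path pythainlp.tokenize.deepcut from this diff. A trie-typed custom_dict is flattened to a word list before being passed to deepcut.tokenize.

from marisa_trie import Trie

from pythainlp.tokenize.deepcut import segment

print(segment("ผมรักคุณ"))                          # plain deepcut tokenization
print(segment("ผมรักคุณ", ["ผมรัก", "คุณ"]))         # with a custom word list
print(segment("ผมรักคุณ", Trie(["ผมรัก", "คุณ"])))   # a trie is converted to a list first
print(segment(""))                                  # [] for empty input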
18 changes: 12 additions & 6 deletions pythainlp/tokenize/longest.py
@@ -6,9 +6,12 @@
https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
"""
import re
from typing import List

from pythainlp.tokenize import DEFAULT_DICT_TRIE

from marisa_trie import Trie

_FRONT_DEP_CHAR = [
"ะ",
"ั",
@@ -36,7 +39,7 @@


class LongestMatchTokenizer(object):
def __init__(self, trie):
def __init__(self, trie: Trie):
self.__trie = trie

def __search_nonthai(self, text: str):
@@ -130,14 +133,17 @@ def __segment_text(self, text: str):

return tokens

def tokenize(self, text):
def tokenize(self, text: str) -> List[str]:
tokens = self.__segment_text(text)
return tokens


def segment(text, trie=None):
def segment(text: str, custom_dict: Trie = None) -> List[str]:
"""ตัดคำภาษาไทยด้วยวิธี longest matching"""
if not trie:
trie = DEFAULT_DICT_TRIE
if not text:
return []

if not custom_dict:
custom_dict = DEFAULT_DICT_TRIE

return LongestMatchTokenizer(trie).tokenize(text)
return LongestMatchTokenizer(custom_dict).tokenize(text)
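
For reference, a short sketch of the renamed parameter above; with custom_dict omitted the bundled DEFAULT_DICT_TRIE is used, while a caller-supplied trie restricts matching to that vocabulary. The module path is taken from this diff and the expected output is indicative only.

from pythainlp.tokenize import dict_trie
from pythainlp.tokenize.longest import segment

print(segment("ตัดคำภาษาไทย"))  # longest matching against the default dictionary

custom = dict_trie(["ตัดคำ", "ภาษาไทย"])
print(segment("ตัดคำภาษาไทย", custom_dict=custom))  # e.g. ['ตัดคำ', 'ภาษาไทย']

print(segment(""))  # [] for empty input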
27 changes: 16 additions & 11 deletions pythainlp/tokenize/multi_cut.py
@@ -8,9 +8,12 @@
"""
import re
from collections import defaultdict
from typing import List

from pythainlp.tokenize import DEFAULT_DICT_TRIE

from marisa_trie import Trie


class LatticeString(str):
"""
@@ -40,13 +43,14 @@ def __init__(self, value, multi=None, in_dict=True):
_PAT_ENG = re.compile(_RE_ENG)


def _multicut(text, trie=None):
def _multicut(text: str, custom_dict: Trie = None):
"""
Yield LatticeString objects, chunk by chunk
"""
if not custom_dict:
custom_dict = DEFAULT_DICT_TRIE

len_text = len(text)
if not trie:
trie = DEFAULT_DICT_TRIE
words_at = defaultdict(list) # main data structure

def serialize(p, p2): # helper function
@@ -64,7 +68,7 @@ def serialize(p, p2): # helper function
p = min(q)
q -= {p} # q.pop, but for set

for w in trie.prefixes(text[p:]):
for w in custom_dict.prefixes(text[p:]):
words_at[p].append(w)
q.add(p + len(w))

@@ -80,7 +84,7 @@ def serialize(p, p2): # helper function
i = p + m.span()[1]
else: # skip as little as possible
for i in range(p, len_text):
ww = trie.prefixes(text[i:])
ww = custom_dict.prefixes(text[i:])
m = _PAT_ENG.match(text[i:])
if ww or m:
break
@@ -93,15 +97,15 @@ def serialize(p, p2): # helper function
q.add(i)


def mmcut(text):
def mmcut(text: str):
res = []
for w in _multicut(text):
mm = min(w.multi, key=lambda x: x.count("/"))
res.extend(mm.split("/"))
return res


def _combine(ww):
def _combine(ww: str):
if ww == []:
yield ""
else:
@@ -114,22 +118,23 @@ def _combine(ww):
yield m.replace("/", "|") + "|" + tail


def segment(text, trie=None):
def segment(text: str, custom_dict: Trie = None) -> List[str]:
"""
Tokenize the text, returning a list of LatticeString chunks
"""
if not text:
return []

return list(_multicut(text, trie=trie))
return list(_multicut(text, custom_dict=custom_dict))


def find_all_segment(text, trie=None):
def find_all_segment(text: str, custom_dict: Trie = None) -> List[str]:
"""
Used to find the list of all possible word segmentations
"""
if not text:
return []

ww = list(_multicut(text, trie=trie))
ww = list(_multicut(text, custom_dict=custom_dict))

return list(_combine(ww))
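
For reference, a hedged sketch of the multi_cut entry points touched above: segment returns LatticeString chunks, find_all_segment enumerates every segmentation the dictionary allows, and mmcut keeps the candidate with the fewest words (maximal matching). The module path is taken from this diff.

from pythainlp.tokenize.multi_cut import find_all_segment, mmcut, segment

text = "น้ำตกสวยมาก"

print(segment(text))           # LatticeString chunks, default dictionary
print(mmcut(text))             # maximal-matching tokens (fewest words)
print(find_all_segment(text))  # all segmentations, tokens separated by "|"
print(segment(""))             # [] for empty input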
17 changes: 9 additions & 8 deletions pythainlp/tokenize/newmm.py
@@ -13,6 +13,8 @@

from pythainlp.tokenize import DEFAULT_DICT_TRIE

from marisa_trie import Trie

from .tcc import tcc_pos

# Helps segment runs of English text and the like
@@ -39,7 +41,7 @@ def bfs_paths_graph(graph, start, goal):
queue.append((next, path + [next]))


def onecut(text: str, trie):
def onecut(text: str, custom_dict: Trie):
graph = defaultdict(list) # main data structure
allow_pos = tcc_pos(text) # cut positions must align with TCC boundaries

@@ -48,7 +50,7 @@ def onecut(text: str, trie):
while q[0] < len(text):
p = heappop(q)

for w in trie.prefixes(text[p:]):
for w in custom_dict.prefixes(text[p:]):
p_ = p + len(w)
if p_ in allow_pos: # keep only positions consistent with TCC
graph[p].append(p_)
@@ -74,7 +76,7 @@ def onecut(text: str, trie):
if i in allow_pos: # also respect TCC
ww = [
w
for w in trie.prefixes(text[i:])
for w in custom_dict.prefixes(text[i:])
if (i + len(w) in allow_pos)
]
ww = [w for w in ww if not _PAT_TWOCHARS.match(w)]
@@ -90,12 +92,11 @@ def onecut(text: str, trie):
heappush(q, i)


# Convenience wrapper to save typing the long form
def segment(text: str, trie=None) -> List[str]:
def segment(text: str, custom_dict: Trie = None) -> List[str]:
if not text:
return []

if not trie:
trie = DEFAULT_DICT_TRIE
if not custom_dict:
custom_dict = DEFAULT_DICT_TRIE

return list(onecut(text, trie))
return list(onecut(text, custom_dict))
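
For reference, a brief sketch of the renamed parameter in newmm above; the default dictionary is used when custom_dict is omitted, and a caller-built trie limits matching to that vocabulary while TCC boundaries are still respected. The module path is taken from this diff and the expected output is indicative only.

from pythainlp.tokenize import dict_trie
from pythainlp.tokenize.newmm import segment

print(segment("โรงเรียนของเรา"))  # dictionary-based maximal matching constrained by TCC

custom = dict_trie(["โรงเรียน", "ของ", "เรา"])
print(segment("โรงเรียนของเรา", custom_dict=custom))  # e.g. ['โรงเรียน', 'ของ', 'เรา']

print(segment(""))  # [] for empty input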