From 2b2ef0bdb9f71d33bf7a04d42289e4560af298a0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 15 Apr 2019 18:53:48 +0200 Subject: [PATCH 1/3] Makes custom dictionary arguments more consistent across different engine. Handles Trie, Iterable[str], and str (path to dictionary). --- pythainlp/tokenize/__init__.py | 60 +++++++++++++++++++++++---------- pythainlp/tokenize/deepcut.py | 17 +++++++--- pythainlp/tokenize/longest.py | 18 ++++++---- pythainlp/tokenize/multi_cut.py | 27 +++++++++------ pythainlp/tokenize/newmm.py | 17 +++++----- 5 files changed, 92 insertions(+), 47 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 40ae585c7..6587e9bb7 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -47,7 +47,7 @@ def word_tokenize( from .newmm import segment as segment_ def segment(text): - return segment_(text, trie=FROZEN_DICT_TRIE) + return segment_(text, custom_dict=FROZEN_DICT_TRIE) elif engine == "icu": from .pyicu import segment @@ -58,20 +58,26 @@ def segment(text): else: # default, use "newmm" engine from .newmm import segment - if not whitespaces: - return [token.strip(" ") for token in segment(text) if token.strip(" ")] + segments = segment(text) - return segment(text) + if whitespaces: + return segments + + return [token.strip(" ") for token in segments if token.strip(" ")] def dict_word_tokenize( - text: str, custom_dict: Trie, engine: str = "newmm" + text: str, + custom_dict: Union[Trie, Iterable[str], str] = DEFAULT_DICT_TRIE, + engine: str = "newmm", + whitespaces: bool = True, ) -> List[str]: """ :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. :param str text: text to be tokenized - :param dict custom_dict: a dictionary trie - :param str engine: choose between different options of engine to token (newmm, mm, longest and deepcut) + :param dict custom_dict: a dictionary trie, or an iterable of words, or a string of dictionary path + :param str engine: choose between different options of engine to token (newmm [default], mm, longest, and deepcut) + :param bool whitespaces: True to output no whitespace, a common mark of end of phrase in Thai :return: list of words **Example**:: >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie @@ -86,16 +92,32 @@ def dict_word_tokenize( if engine == "newmm" or engine == "onecut": from .newmm import segment + + custom_dict = dict_trie(custom_dict) elif engine == "longest" or engine == "longest-matching": from .longest import segment + + custom_dict = dict_trie(custom_dict) elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment + + custom_dict = dict_trie(custom_dict) elif engine == "deepcut": from .deepcut import segment - return segment(text,list(custom_dict)) + + if not isinstance(custom_dict, List) and not isinstance(custom_dict, str): + custom_dict = list(custom_dict) else: # default, use "newmm" engine from .newmm import segment - return segment(text, custom_dict) + + custom_dict = dict_trie(custom_dict) + + segments = segment(text, custom_dict) + + if whitespaces: + return segments + + return [token.strip(" ") for token in segments if token.strip(" ")] def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: @@ -135,11 +157,8 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: if engine == "etcc": from .etcc import segment - - return segment(text) - - # default is "tcc" - from .tcc import segment + else: 
# default + from .tcc import segment return segment(text) @@ -164,7 +183,7 @@ def syllable_tokenize(text: str) -> List[str]: return tokens -def dict_trie(dict_source: Union[str, Iterable]) -> Trie: +def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: """ Create a dict trie which will be used for word_tokenize() function. For more information on the trie data structure, @@ -173,20 +192,25 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie: :param string/list dict_source: a list of vocaburaries or a path to source file :return: a trie created from a dictionary input """ + trie = None if type(dict_source) is str: # Receive a file path of the dict to read with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() - return Trie(_vocabs) + trie = Trie(_vocabs) elif isinstance(dict_source, Iterable): # Received a sequence type object of vocabs - return Trie(dict_source) + trie = Trie(dict_source) + elif isinstance(dict_source, Trie): + trie = dict_source else: raise TypeError( - "Type of dict_source must be either str (path to source file) or iterable" + "Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)" ) + return trie + class Tokenizer: def __init__( diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 31636e06b..f3ec1efb4 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -3,12 +3,21 @@ Wrapper for deepcut Thai word segmentation """ -from typing import List +from typing import List, Union import deepcut +from marisa_trie import Trie + + +def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]: + if not text: + return [] + + if custom_dict: + if isinstance(custom_dict, Trie): + custom_dict = list(custom_dict) + + return deepcut.tokenize(text, custom_dict) -def segment(text: str,dict_source:List[str]=None) -> List[str]: - if dict_source!=None: - return deepcut.tokenize(text, custom_dict=dict_source) return deepcut.tokenize(text) diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 83ce495a1..db0bf889c 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -6,9 +6,12 @@ https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py """ import re +from typing import List from pythainlp.tokenize import DEFAULT_DICT_TRIE +from marisa_trie import Trie + _FRONT_DEP_CHAR = [ "ะ", "ั", @@ -36,7 +39,7 @@ class LongestMatchTokenizer(object): - def __init__(self, trie): + def __init__(self, trie: Trie): self.__trie = trie def __search_nonthai(self, text: str): @@ -130,14 +133,17 @@ def __segment_text(self, text: str): return tokens - def tokenize(self, text): + def tokenize(self, text: str) -> List[str]: tokens = self.__segment_text(text) return tokens -def segment(text, trie=None): +def segment(text: str, custom_dict: Trie = None) -> List[str]: """ตัดคำภาษาไทยด้วยวิธี longest matching""" - if not trie: - trie = DEFAULT_DICT_TRIE + if not text: + return [] + + if not custom_dict: + custom_dict = DEFAULT_DICT_TRIE - return LongestMatchTokenizer(trie).tokenize(text) + return LongestMatchTokenizer(custom_dict).tokenize(text) diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py index d161bdf4e..5d1238336 100644 --- a/pythainlp/tokenize/multi_cut.py +++ b/pythainlp/tokenize/multi_cut.py @@ -8,9 +8,12 @@ """ import re from collections import defaultdict +from typing import List from pythainlp.tokenize import DEFAULT_DICT_TRIE +from 
marisa_trie import Trie + class LatticeString(str): """ @@ -40,13 +43,14 @@ def __init__(self, value, multi=None, in_dict=True): _PAT_ENG = re.compile(_RE_ENG) -def _multicut(text, trie=None): +def _multicut(text: str, custom_dict: Trie = None): """ ส่งคืน LatticeString คืนมาเป็นก้อนๆ """ + if not custom_dict: + custom_dict = DEFAULT_DICT_TRIE + len_text = len(text) - if not trie: - trie = DEFAULT_DICT_TRIE words_at = defaultdict(list) # main data structure def serialize(p, p2): # helper function @@ -64,7 +68,7 @@ def serialize(p, p2): # helper function p = min(q) q -= {p} # q.pop, but for set - for w in trie.prefixes(text[p:]): + for w in custom_dict.prefixes(text[p:]): words_at[p].append(w) q.add(p + len(w)) @@ -80,7 +84,7 @@ def serialize(p, p2): # helper function i = p + m.span()[1] else: # skip น้อยที่สุด ที่เป็นไปได้ for i in range(p, len_text): - ww = trie.prefixes(text[i:]) + ww = custom_dict.prefixes(text[i:]) m = _PAT_ENG.match(text[i:]) if ww or m: break @@ -93,7 +97,7 @@ def serialize(p, p2): # helper function q.add(i) -def mmcut(text): +def mmcut(text: str): res = [] for w in _multicut(text): mm = min(w.multi, key=lambda x: x.count("/")) @@ -101,7 +105,7 @@ def mmcut(text): return res -def _combine(ww): +def _combine(ww: str): if ww == []: yield "" else: @@ -114,22 +118,23 @@ def _combine(ww): yield m.replace("/", "|") + "|" + tail -def segment(text, trie=None): +def segment(text: str, custom_dict: Trie = None) -> List[str]: """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ if not text: return [] - return list(_multicut(text, trie=trie)) + return list(_multicut(text, custom_dict=custom_dict)) -def find_all_segment(text, trie=None): +def find_all_segment(text: str, custom_dict: Trie = None) -> List[str]: """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ if not text: return [] - ww = list(_multicut(text, trie=trie)) + ww = list(_multicut(text, custom_dict=custom_dict)) + return list(_combine(ww)) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 066ff1017..88b766eea 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -13,6 +13,8 @@ from pythainlp.tokenize import DEFAULT_DICT_TRIE +from marisa_trie import Trie + from .tcc import tcc_pos # ช่วยตัดพวกภาษาอังกฤษ เป็นต้น @@ -39,7 +41,7 @@ def bfs_paths_graph(graph, start, goal): queue.append((next, path + [next])) -def onecut(text: str, trie): +def onecut(text: str, custom_dict: Trie): graph = defaultdict(list) # main data structure allow_pos = tcc_pos(text) # ตำแหน่งที่ตัด ต้องตรงกับ tcc @@ -48,7 +50,7 @@ def onecut(text: str, trie): while q[0] < len(text): p = heappop(q) - for w in trie.prefixes(text[p:]): + for w in custom_dict.prefixes(text[p:]): p_ = p + len(w) if p_ in allow_pos: # เลือกที่สอดคล้อง tcc graph[p].append(p_) @@ -74,7 +76,7 @@ def onecut(text: str, trie): if i in allow_pos: # ใช้ tcc ด้วย ww = [ w - for w in trie.prefixes(text[i:]) + for w in custom_dict.prefixes(text[i:]) if (i + len(w) in allow_pos) ] ww = [w for w in ww if not _PAT_TWOCHARS.match(w)] @@ -90,12 +92,11 @@ def onecut(text: str, trie): heappush(q, i) -# ช่วยให้ไม่ต้องพิมพ์ยาวๆ -def segment(text: str, trie=None) -> List[str]: +def segment(text: str, custom_dict: Trie = None) -> List[str]: if not text: return [] - if not trie: - trie = DEFAULT_DICT_TRIE + if not custom_dict: + custom_dict = DEFAULT_DICT_TRIE - return list(onecut(text, trie)) + return list(onecut(text, custom_dict)) From e84822151151f44bb689a21f5cc6c387ca6364b0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 15 Apr 2019 
19:33:57 +0200 Subject: [PATCH 2/3] More test cases for dict_word_tokenize and deepcut --- tests/__init__.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index b2f7c711f..9a33a9902 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -29,6 +29,7 @@ from pythainlp.tag.locations import tag_provinces from pythainlp.tag.named_entity import ThaiNameTagger from pythainlp.tokenize import ( + DEFAULT_DICT_TRIE, FROZEN_DICT_TRIE, Tokenizer, dict_trie, @@ -43,6 +44,7 @@ tcc, word_tokenize, ) +from pythainlp.tokenize import deepcut as tokenize_deepcut from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list @@ -305,6 +307,7 @@ def test_dict_word_tokenize(self): "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="longest", + whitespaces=False, ) ) self.assertIsNotNone( @@ -351,10 +354,15 @@ def test_word_tokenize_icu(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) - # def test_word_tokenize_deepcut(self): - # self.assertEqual(deepcut.segment(None), []) - # self.assertEqual(deepcut.segment(""), []) - # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + def test_word_tokenize_deepcut(self): + self.assertEqual(tokenize_deepcut.segment(None), []) + self.assertEqual(tokenize_deepcut.segment(""), []) + self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", DEFAULT_DICT_TRIE)) + self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", ["ทด", "สอบ"])) + self.assertIsNotNone(dict_word_tokenize("ทดสอบ", engine="deepcut")) + self.assertIsNotNone( + dict_word_tokenize("ทดสอบ", engine="deepcut", custom_dict=["ทด", "สอบ"]) + ) def test_word_tokenize_longest_matching(self): self.assertEqual(longest.segment(None), []) @@ -405,9 +413,10 @@ def test_sent_tokenize(self): self.assertEqual(sent_tokenize("รักน้ำ รักปลา "), ["รักน้ำ", "รักปลา"]) def test_subword_tokenize(self): - self.assertEqual(subword_tokenize(None), "") - self.assertEqual(subword_tokenize(""), "") - self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร")) + self.assertEqual(subword_tokenize(None), []) + self.assertEqual(subword_tokenize(""), []) + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="tcc")) + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="etcc")) def test_syllable_tokenize(self): self.assertEqual(syllable_tokenize(None), []) From a5525c374425a9062fda489911c3018211e83f77 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 15 Apr 2019 19:47:22 +0200 Subject: [PATCH 3/3] if input is empty, subword_tokenize() should return empty list. --- pythainlp/tokenize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 6587e9bb7..d8cc6bafe 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -153,7 +153,7 @@ def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: :return: a list of tokenized strings. """ if not text: - return "" + return [] if engine == "etcc": from .etcc import segment
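
The snippet below is a minimal sketch, not part of the patch series itself, of how the unified custom_dict argument could be exercised once these patches are applied. It follows the doctest style already used in the dict_word_tokenize docstring; the word list and the dictionary-file path are purely illustrative.

    >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
    >>> words = ["แมว", "ดี"]
    >>> dict_word_tokenize("แมวดีดีแมว", custom_dict=words)              # iterable of words
    >>> dict_word_tokenize("แมวดีดีแมว", custom_dict=dict_trie(words))   # pre-built trie
    >>> dict_word_tokenize("แมวดี ดีแมว", custom_dict=words, whitespaces=False)  # strip space tokens
    >>> dict_word_tokenize("แมวดีดีแมว", custom_dict="/path/to/dict.txt", engine="longest")  # hypothetical path to a word-list file

Per the changes in __init__.py, every engine except deepcut normalizes custom_dict through dict_trie(), which after this patch also accepts an already-built Trie and returns it as-is; the deepcut engine instead converts a Trie to a plain list of words (or forwards a file path) before handing it to deepcut.tokenize().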