From 797caefde80ab6513861e228a8aa1e379396d274 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 15 Apr 2019 14:42:19 +0700 Subject: [PATCH 1/6] deepcut & dict_word_tokenize --- pythainlp/tokenize/__init__.py | 8 ++++++++ pythainlp/tokenize/deepcut.py | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 94e952fac..8e3756429 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -9,6 +9,7 @@ from marisa_trie import Trie +DICT_LIST=thai_words() DEFAULT_DICT_TRIE = Trie(thai_words()) FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt")) @@ -80,6 +81,7 @@ def dict_word_tokenize( >>> dict_word_tokenize("แมวดีดีแมว", trie) ['แมว', 'ดี', 'ดี', 'แมว'] """ + global DICT_LIST if not text: return [] @@ -90,6 +92,9 @@ def dict_word_tokenize( from .longest import segment elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment + elif engine == "deepcut": + from .deepcut import segment + return segment(text,DICT_LIST) else: # default, use "newmm" engine from .newmm import segment @@ -171,14 +176,17 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie: :param string/list dict_source: a list of vocaburaries or a path to source file :return: a trie created from a dictionary input """ + global DICT_LIST if type(dict_source) is str: # Receive a file path of the dict to read with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() + DICT_LIST=_vocabs return Trie(_vocabs) elif isinstance(dict_source, Iterable): # Received a sequence type object of vocabs + _vocabs=dict_source return Trie(dict_source) else: raise TypeError( diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index a3844c2f3..31636e06b 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -8,5 +8,7 @@ import deepcut -def segment(text: str) -> List[str]: +def segment(text: 
str,dict_source:List[str]=None) -> List[str]: + if dict_source!=None: + return deepcut.tokenize(text, custom_dict=dict_source) return deepcut.tokenize(text) From 6e38de0dab1b6112ee7d355a41e69226ce2aade7 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 15 Apr 2019 14:57:34 +0700 Subject: [PATCH 2/6] update dict trie from Trie to (Trie,List) --- pythainlp/tokenize/__init__.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 8e3756429..e17579c5a 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -81,7 +81,6 @@ def dict_word_tokenize( >>> dict_word_tokenize("แมวดีดีแมว", trie) ['แมว', 'ดี', 'ดี', 'แมว'] """ - global DICT_LIST if not text: return [] @@ -94,11 +93,11 @@ def dict_word_tokenize( from .multi_cut import segment elif engine == "deepcut": from .deepcut import segment - return segment(text,DICT_LIST) + return segment(text,custom_dict[1]) else: # default, use "newmm" engine from .newmm import segment - return segment(text, custom_dict) + return segment(text, custom_dict[0]) def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: @@ -176,18 +175,15 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie: :param string/list dict_source: a list of vocaburaries or a path to source file :return: a trie created from a dictionary input """ - global DICT_LIST if type(dict_source) is str: # Receive a file path of the dict to read with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() - DICT_LIST=_vocabs - return Trie(_vocabs) + return (Trie(_vocabs),_vocabs) elif isinstance(dict_source, Iterable): # Received a sequence type object of vocabs - _vocabs=dict_source - return Trie(dict_source) + return (Trie(dict_source),dict_source) else: raise TypeError( "Type of dict_source must be either str (path to source file) or iterable" From 1a3338b74acf8276ede5753d4d55a1d354d76252 Mon Sep 17 
00:00:00 2001 From: Wannaphong Date: Mon, 15 Apr 2019 15:09:11 +0700 Subject: [PATCH 3/6] fix bug test --- pythainlp/tokenize/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index e17579c5a..f32f48f54 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -9,7 +9,6 @@ from marisa_trie import Trie -DICT_LIST=thai_words() DEFAULT_DICT_TRIE = Trie(thai_words()) FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt")) @@ -96,8 +95,10 @@ def dict_word_tokenize( return segment(text,custom_dict[1]) else: # default, use "newmm" engine from .newmm import segment - - return segment(text, custom_dict[0]) + if type(custom_dict) is tuple: + return segment(text, custom_dict[0]) + else: + return segment(text, custom_dict) def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: From 0b1817f4d00a8de5ebadfc226e075fddf55e7cea Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 15 Apr 2019 15:21:25 +0700 Subject: [PATCH 4/6] Trie to list for deepcut --- pythainlp/tokenize/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index f32f48f54..82566f5c9 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -92,13 +92,10 @@ def dict_word_tokenize( from .multi_cut import segment elif engine == "deepcut": from .deepcut import segment - return segment(text,custom_dict[1]) + return segment(text,list(custom_dict)) else: # default, use "newmm" engine from .newmm import segment - if type(custom_dict) is tuple: - return segment(text, custom_dict[0]) - else: - return segment(text, custom_dict) + return segment(text, custom_dict) def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: @@ -181,10 +178,10 @@ def dict_trie(dict_source: Union[str, Iterable]) -> Trie: # Receive a file path of 
the dict to read with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() - return (Trie(_vocabs),_vocabs) + return Trie(_vocabs) elif isinstance(dict_source, Iterable): # Received a sequence type object of vocabs - return (Trie(dict_source),dict_source) + return Trie(dict_source) else: raise TypeError( "Type of dict_source must be either str (path to source file) or iterable" From 1b5109e3d7079ef91a1935d68830c40ef8791044 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 15 Apr 2019 15:24:58 +0700 Subject: [PATCH 5/6] update docs --- pythainlp/tokenize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 82566f5c9..40ae585c7 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -71,7 +71,7 @@ def dict_word_tokenize( :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. 
:param str text: text to be tokenized :param dict custom_dict: a dictionary trie - :param str engine: choose between different options of engine to token (newmm, longest) + :param str engine: tokenization engine to use (newmm, mm, longest, or deepcut) :return: list of words **Example**:: >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie From 3ab9dc40894fb0c6d867050b8f3076466617053a Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 15 Apr 2019 15:40:46 +0700 Subject: [PATCH 6/6] =?UTF-8?q?del=20=E0=B9=92,=E0=B9=95=E0=B9=94=E0=B9=90?= =?UTF-8?q?=20=E0=B8=A3=E0=B8=B2=E0=B8=A2=E0=B8=81=E0=B8=B2=E0=B8=A3=20fro?= =?UTF-8?q?m=20=20words=5Fth.txt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pythainlp/corpus/words_th.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/pythainlp/corpus/words_th.txt b/pythainlp/corpus/words_th.txt index 4e6259387..0fa96af67 100755 --- a/pythainlp/corpus/words_th.txt +++ b/pythainlp/corpus/words_th.txt @@ -61186,7 +61186,6 @@ แอกน้อย แอด ๆ แอบ ๆ -๒,๕๔๐ รายการ โอ้กอ้าก โอฆ โอฆชล