diff --git a/pythainlp/corpus/words_th.txt b/pythainlp/corpus/words_th.txt
index 4e6259387..0fa96af67 100755
--- a/pythainlp/corpus/words_th.txt
+++ b/pythainlp/corpus/words_th.txt
@@ -61186,7 +61186,6 @@
 แอกน้อย
 แอด ๆ
 แอบ ๆ
-๒,๕๔๐ รายการ
 โอ้กอ้าก
 โอฆ
 โอฆชล
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 94e952fac..40ae585c7 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -71,7 +71,7 @@ def dict_word_tokenize(
     :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.
     :param str text: text to be tokenized
     :param dict custom_dict: a dictionary trie
-    :param str engine: choose between different options of engine to token (newmm, longest)
+    :param str engine: choose between different options of engine to token (newmm, mm, longest and deepcut)
     :return: list of words
     **Example**::
         >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
@@ -90,9 +90,11 @@ def dict_word_tokenize(
         from .longest import segment
     elif engine == "mm" or engine == "multi_cut":
         from .multi_cut import segment
+    elif engine == "deepcut":
+        from .deepcut import segment
+        return segment(text, list(custom_dict))
     else:  # default, use "newmm" engine
         from .newmm import segment
-
     return segment(text, custom_dict)
diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py
index a3844c2f3..31636e06b 100644
--- a/pythainlp/tokenize/deepcut.py
+++ b/pythainlp/tokenize/deepcut.py
@@ -8,5 +8,7 @@
 import deepcut
 
 
-def segment(text: str) -> List[str]:
+def segment(text: str, dict_source: List[str] = None) -> List[str]:
+    if dict_source is not None:
+        return deepcut.tokenize(text, custom_dict=dict_source)
     return deepcut.tokenize(text)