Merged

Changes from all commits (45 commits)
1145ee8
Initial commit, the long text fix will follow
bact Oct 12, 2019
4e7b639
fix newmm issue with long text
bact Oct 12, 2019
7b0c3a4
fix PEP 8 issues
bact Oct 13, 2019
1fb4c1d
Update test_corpus.py
bact Oct 13, 2019
8f894ee
remove person names
bact Oct 13, 2019
2af315b
remove few hyphened words
bact Oct 13, 2019
f3b4daa
Try smaller window
bact Oct 17, 2019
7f60b2a
Fix appveyor.yml
bact Oct 17, 2019
dcded4a
Merge branch 'dev' into fix-newmm-longtext
wannaphong Oct 17, 2019
307cc9f
try break by the right-most space first, to speed up
bact Oct 19, 2019
78270ad
fix cut_pos
bact Oct 19, 2019
a1df24b
add more test cases
bact Oct 19, 2019
bb75b8e
remove obvious compound words from dictionary
bact Oct 19, 2019
73a1827
Comments in English
bact Oct 20, 2019
a17a604
Merge pull request #308 from PyThaiNLP/dev
bact Oct 20, 2019
fad2e83
Merge pull request #310 from PyThaiNLP/dev
bact Oct 20, 2019
142e9b1
Merge pull request #312 from PyThaiNLP/dev
bact Oct 21, 2019
9309ee6
Merge pull request #315 from PyThaiNLP/dev
bact Nov 3, 2019
80e7a5a
Update .travis.yml
wannaphong Nov 6, 2019
6070951
Delete pythainlp-1_7-2_0.rst (build and deploy docs)
wannaphong Nov 6, 2019
af16bea
add "newmm-safe" option
bact Nov 7, 2019
bbfabb4
Update CORPUS_DB_URL
wannaphong Nov 8, 2019
fa288fa
Update ThaiNER 1.2 to ThaiNER 1.3
wannaphong Nov 8, 2019
be27627
Update etcc.py
wannaphong Nov 11, 2019
fd5c44d
Update etcc.py (build and deploy docs)
wannaphong Nov 11, 2019
0332cf2
Add test for newmm-safe mode
bact Nov 12, 2019
c627f82
Update docstring for newmm-safe
bact Nov 12, 2019
1f3faf0
Fixed Travis CI : Update ThaiNER
wannaphong Nov 12, 2019
4a04e98
add long text test case for newmm-safe
bact Nov 13, 2019
c464c6d
Merge branch 'fix-newmm-longtext' of https://github.com/PyThaiNLP/pyt…
bact Nov 13, 2019
817c2c8
add more type hints
bact Nov 13, 2019
9159940
fix Generator type hinting
bact Nov 13, 2019
152e238
Update Tennsorflow version to 2 for deepcut test
bact Nov 13, 2019
b2d72d1
Merge branch 'dev' into fix-newmm-longtext
bact Nov 13, 2019
23f3856
Merge pull request #302 from PyThaiNLP/fix-newmm-longtext
bact Nov 14, 2019
112418e
Update __init__.py
bact Nov 14, 2019
8d27dbc
if -> elif engine == "newmm-safe"
bact Nov 14, 2019
29ceec2
change thai_time() precision params from "minute" and "second" to "m"…
bact Nov 14, 2019
5cf8e44
Update README.md
bact Nov 14, 2019
50821ea
Update newmm.py
bact Nov 14, 2019
fe736dd
Update newmm.py
bact Nov 14, 2019
f6f1845
makes PEP8 happy
bact Nov 14, 2019
86b9c56
more test cases for thai_time()
bact Nov 14, 2019
8fc83ae
use self.index2word, self.word_vec() instead of deprecated self.wv...
bact Nov 14, 2019
cb27a35
close file
bact Nov 14, 2019
2 changes: 1 addition & 1 deletion .travis.yml
@@ -18,7 +18,7 @@ before_install:
- sudo rm -f /etc/boto.cfg

install:
- pip install "tensorflow>=1.14,<2" deepcut
- pip install "tensorflow>=2,<3" deepcut
- pip install -r requirements.txt
- pip install .[full]
- pip install coveralls
8 changes: 4 additions & 4 deletions README.md
@@ -10,7 +10,7 @@
[![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp.svg?branch=develop)](https://travis-ci.org/PyThaiNLP/pythainlp)
[![Build status](https://ci.appveyor.com/api/projects/status/9g3mfcwchi8em40x?svg=true)](https://ci.appveyor.com/project/wannaphongcom/pythainlp-9y1ch)
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7)](https://www.codacy.com/app/pythainlp/pythainlp_2?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=PyThaiNLP/pythainlp&amp;utm_campaign=Badge_Grade)
[![Coverage Status](https://coveralls.io/repos/github/PyThaiNLP/pythainlp/badge.svg?branch=dev)](https://coveralls.io/github/PyThaiNLP/pythainlp?branch=dev) [![Google Colab Badge](https://badgen.net/badge/Launch%20Quick%20Start%20Guide/on%20Google%20Colab/blue?icon=terminal)](https://colab.research.google.com/github/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp-get-started.ipynb)
[![Coverage Status](https://coveralls.io/repos/github/PyThaiNLP/pythainlp/badge.svg?branch=dev)](https://coveralls.io/github/PyThaiNLP/pythainlp?branch=dev) [![Google Colab Badge](https://badgen.net/badge/Launch%20Quick%20Start%20Guide/on%20Google%20Colab/blue?icon=terminal)](https://colab.research.google.com/github/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp_get_started.ipynb)
[![DOI](https://zenodo.org/badge/61813823.svg)](https://zenodo.org/badge/latestdoi/61813823)

Thai Natural Language Processing in Python.
@@ -24,7 +24,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil
**This is a document for development branch (post 2.0). Things will break.**

- The latest stable release is [2.0.7](https://github.com/PyThaiNLP/pythainlp/releases)
- The latest development release is [2.1.dev7](https://github.com/PyThaiNLP/pythainlp/releases). See [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
- The latest development release is [2.1.dev7](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
- 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page


@@ -89,7 +89,7 @@ The data location can be changed, using `PYTHAINLP_DATA_DIR` environment variabl

## Documentation

- [PyThaiNLP Get Started notebook](https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp-get-started.ipynb)
- [PyThaiNLP Get Started](https://www.thainlp.org/pythainlp/tutorials/notebooks/pythainlp_get_started.html)
- More tutorials at [https://www.thainlp.org/pythainlp/tutorials/](https://www.thainlp.org/pythainlp/tutorials/)
- See full documentation at [https://thainlp.org/pythainlp/docs/2.0/](https://thainlp.org/pythainlp/docs/2.0/)

@@ -198,7 +198,7 @@ pip install pythainlp[extra1,extra2,...]

## Documentation (translated from Thai)

- [PyThaiNLP Get Started notebook](https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/pythainlp-get-started.ipynb)
- [PyThaiNLP Get Started](https://www.thainlp.org/pythainlp/tutorials/notebooks/pythainlp_get_started.html)
- More tutorials, in notebook form: [https://www.thainlp.org/pythainlp/tutorials/](https://www.thainlp.org/pythainlp/tutorials/)
- Full documentation: [https://thainlp.org/pythainlp/docs/2.0/](https://thainlp.org/pythainlp/docs/2.0/)

4 changes: 2 additions & 2 deletions appveyor.yml
@@ -98,8 +98,8 @@ install:
- pip --version
- pip install coveralls[yaml]
- pip install coverage
- pip install "tensorflow>=1.14,<2" deepcut
- pip install torch==1.2.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
- pip install "tensorflow>=2,<3" deepcut
- pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
- pip install %PYICU_PKG%
- pip install %ARTAGGER_PKG%
- pip install -e .[full]
96 changes: 0 additions & 96 deletions docs/notes/pythainlp-1_7-2_0.rst

This file was deleted.

12 changes: 6 additions & 6 deletions pythainlp/corpus/__init__.py
@@ -18,7 +18,7 @@
_CORPUS_DB_URL = (
"https://raw.githubusercontent.com/"
+ "PyThaiNLP/pythainlp-corpus/"
+ "master/db.json"
+ "2.1/db.json"
)

_CORPUS_DB_FILENAME = "db.json"
@@ -165,12 +165,12 @@ def _check_hash(dst: str, md5: str) -> NoReturn:
@param: md5 place to hash the file (MD5)
"""
if md5 and md5 != "-":
f = open(get_full_data_path(dst), "rb")
content = f.read()
file_md5 = hashlib.md5(content).hexdigest()
with open(get_full_data_path(dst), "rb") as f:
content = f.read()
file_md5 = hashlib.md5(content).hexdigest()

if md5 != file_md5:
raise Exception("Hash does not match expected.")
if md5 != file_md5:
raise Exception("Hash does not match expected.")


def download(name: str, force: bool = False) -> NoReturn:
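The `_check_hash` hunk above moves the file read into a context manager so the handle is closed even if hashing fails. A standalone, stdlib-only sketch of the same pattern (`check_hash` here is an illustrative stand-in, not PyThaiNLP's actual function):

```python
import hashlib
import tempfile

def check_hash(path: str, md5: str) -> None:
    # Mirrors the refactored pattern: read inside a "with" block
    # so the file handle is always closed.
    if md5 and md5 != "-":
        with open(path, "rb") as f:
            file_md5 = hashlib.md5(f.read()).hexdigest()
        if md5 != file_md5:
            raise Exception("Hash does not match expected.")

# Usage: write a temp file and verify its checksum.
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"hello")
check_hash(tmp.name, hashlib.md5(b"hello").hexdigest())  # passes silently
```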
6 changes: 3 additions & 3 deletions pythainlp/tag/named_entity.py
@@ -76,10 +76,10 @@ def __init__(self):
"""
Thai named-entity recognizer
"""
self.__data_path = get_corpus_path("thainer-1-2")
self.__data_path = get_corpus_path("thainer-1-3")
if not self.__data_path:
download("thainer-1-2")
self.__data_path = get_corpus_path("thainer-1-2")
download("thainer-1-3")
self.__data_path = get_corpus_path("thainer-1-3")
self.crf = sklearn_crfsuite.CRF(
algorithm="lbfgs",
c1=0.1,
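The hunk above bumps ThaiNER 1.2 to 1.3 while keeping the same "get path, else download, then get again" control flow. A runnable stand-in of that flow (the dict-backed registry and the `/data/` path are hypothetical; the real `get_corpus_path`/`download` live in `pythainlp.corpus`):

```python
_local_corpora = {}  # hypothetical stand-in for the local corpus registry

def download(name: str) -> None:
    # Pretend to fetch the corpus and record where it landed.
    _local_corpora[name] = "/data/" + name

def get_corpus_path(name: str):
    return _local_corpora.get(name)

# Same control flow as ThaiNameTagger.__init__ after this PR:
data_path = get_corpus_path("thainer-1-3")
if not data_path:
    download("thainer-1-3")
    data_path = get_corpus_path("thainer-1-3")
```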
28 changes: 22 additions & 6 deletions pythainlp/tokenize/__init__.py
@@ -33,6 +33,8 @@ def word_tokenize(
**Options for engine**
* *newmm* (default) - dictionary-based, Maximum Matching +
Thai Character Cluster
* *newmm-safe* - newmm, with a mechanism to avoid long
processing time for some long continuous text without spaces
* *longest* - dictionary-based, Longest Matching
* *icu* - wrapper for ICU (International Components for Unicode,
using PyICU), dictionary-based
@@ -101,10 +103,15 @@ def word_tokenize(
return []

segments = []

if engine == "newmm" or engine == "onecut":
from .newmm import segment

segments = segment(text, custom_dict)
elif engine == "newmm-safe":
from .newmm import segment

segments = segment(text, custom_dict, safe_mode=True)
elif engine == "attacut":
from .attacut import segment

@@ -157,6 +164,7 @@ def dict_word_tokenize(
:param bool keep_whitespace: True to keep whitespaces, a common mark
for end of phrase in Thai
:return: list of words
:rtype: list[str]
"""
warnings.warn(
"dict_word_tokenize is deprecated. Use word_tokenize with a custom_dict argument instead.",
@@ -336,6 +344,7 @@ def syllable_tokenize(text: str, engine: str = "default") -> List[str]:
tokens.extend(word_tokenize(text=word, custom_dict=trie))
else:
from .ssg import segment

tokens = segment(text)

return tokens
@@ -345,9 +354,10 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
"""
Create a dictionary trie which will be used for word_tokenize() function.

:param string/list dict_source: a list of vocaburaries or a path
to source file
:return: a trie created from a dictionary input
:param str|Iterable[str]|pythainlp.tokenize.Trie dict_source: a path to
dictionary file or a list of words or a pythainlp.tokenize.Trie object
:return: a trie object created from a dictionary input
:rtype: pythainlp.tokenize.Trie
"""
trie = None

@@ -359,7 +369,9 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
_vocabs = f.read().splitlines()
trie = Trie(_vocabs)
elif isinstance(dict_source, Iterable):
# Note: Trie and str are both Iterable, Iterable check should be here
# Note: Trie and str are both Iterable,
# so the Iterable check should be here, at the very end,
# because it is the least specific
# Received a sequence type object of vocabs
trie = Trie(dict_source)
else:
@@ -435,7 +447,9 @@ class Tokenizer:
"""

def __init__(
self, custom_dict: Union[Trie, Iterable[str], str] = None, engine: str = "newmm"
self,
custom_dict: Union[Trie, Iterable[str], str] = None,
engine: str = "newmm",
):
"""
Initialize tokenizer object
@@ -458,7 +472,9 @@ def word_tokenize(self, text: str) -> List[str]:
:return: list of words, tokenized from the text
:rtype: list[str]
"""
return word_tokenize(text, custom_dict=self.__trie_dict, engine=self.__engine)
return word_tokenize(
text, custom_dict=self.__trie_dict, engine=self.__engine
)

def set_tokenize_engine(self, engine: str) -> None:
"""
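The "newmm-safe" engine this PR introduces guards against long processing times on long runs of text without spaces (see the commits "Try smaller window" and "try break by the right-most space first, to speed up"). A stdlib-only sketch of that idea, with an assumed window size; all names here are illustrative and not PyThaiNLP's actual internals:

```python
MAX_WINDOW = 50  # assumed cap on the text handed to the tokenizer at once

def safe_chunks(text: str, limit: int = MAX_WINDOW):
    """Split text into chunks of at most `limit` characters,
    preferring to cut at the right-most space inside the window."""
    chunks = []
    while len(text) > limit:
        cut = text.rfind(" ", 0, limit)  # try the right-most space first
        if cut <= 0:
            cut = limit                  # no space found: hard cut at the edge
        chunks.append(text[:cut])
        text = text[cut:]
    if text:
        chunks.append(text)
    return chunks
```

Each chunk would then be fed to the regular maximum-matching tokenizer, bounding the worst-case cost per call.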
3 changes: 1 addition & 2 deletions pythainlp/tokenize/etcc.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
Enhanced Thai Character Cluster (ETCC)
Enhanced Thai Character Cluster (ETCC) (In progress)
Python implementation by Wannaphong Phatthiyaphaibun (19 June 2017)

:See Also:
@@ -75,5 +75,4 @@ def segment(text: str) -> str:
text = re.sub(i, ii + "/", text)

text = re.sub("//", "/", text)

return text.split("/")