From efcf7504a484996311a6b239b9fe1e261a30c19e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 3 May 2019 21:26:37 +0700 Subject: [PATCH 1/7] add return type --- pythainlp/corpus/__init__.py | 20 +++++++++++++------- pythainlp/spell/pn.py | 2 +- tests/__init__.py | 1 + 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 46e557546..036dbf9d7 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +from typing import NoReturn, Union from urllib.request import urlopen import requests @@ -14,9 +15,9 @@ _CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME) _CORPUS_DB_URL = ( - "https://raw.githubusercontent.com/" + - "PyThaiNLP/pythainlp-corpus/" + - "master/db.json" + "https://raw.githubusercontent.com/" + + "PyThaiNLP/pythainlp-corpus/" + + "master/db.json" ) _CORPUS_DB_FILENAME = "db.json" @@ -51,7 +52,7 @@ def get_corpus(filename: str) -> frozenset: return frozenset(lines) -def get_corpus_path(name: str) -> [str, None]: +def get_corpus_path(name: str) -> Union[str, None]: """ Get corpus path @@ -72,18 +73,21 @@ def get_corpus_path(name: str) -> [str, None]: return None -def _download(url: str, dst: str): +def _download(url: str, dst: str) -> int: """ @param: url to download file @param: dst place to put the file """ file_size = int(urlopen(url).info().get("Content-Length", -1)) + if os.path.exists(dst): first_byte = os.path.getsize(dst) else: first_byte = 0 + if first_byte >= file_size: return file_size + header = {"Range": "bytes=%s-%s" % (first_byte, file_size)} pbar = tqdm( total=file_size, @@ -99,10 +103,11 @@ def _download(url: str, dst: str): f.write(chunk) pbar.update(1024) pbar.close() - # return file_size + return file_size -def download(name: str, force: bool = False): + +def download(name: str, force: bool = False) -> NoReturn: """ Download corpus @@ -113,6 +118,7 @@ def download(name: str, force: bool = False): temp = Query() data = requests.get(corpus_db_url()) data_json = data.json() + if name in list(data_json.keys()): temp_name = data_json[name] print("Download:", name) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 2d509cb15..eccbb27d1 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -34,7 +34,7 @@ def _keep( min_len: int, max_len: int, dict_filter: Callable[[str], bool], -): +) -> Callable[[str], bool]: """ Keep only Thai words with at least min_freq frequency and has length between min_len and max_len characters diff --git a/tests/__init__.py b/tests/__init__.py index 4ffcaf2c8..1c2e217d0 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -577,6 +577,7 @@ def test_thai_strftime(self): thai_strftime(date, "%Aที่ %d %B พ.ศ. %Y เวลา %H:%Mน. (%a %d-%b-%y) %% %"), "วันพุธที่ 06 ตุลาคม พ.ศ. 2519 เวลา 01:40น. (พ 06-ต.ค.-19) % %", ) + self.assertIsNotNone(thai_strftime(date, "%A%a%B%b%C%c%D%F%G%g%v%X%x%Y%y%+")) # ### pythainlp.util.normalize From 78593c1a31db24fc8fa4e56e8b5f13c7aff88eb6 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 3 May 2019 21:38:25 +0700 Subject: [PATCH 2/7] Test with Python 3.7 on Windows --- appveyor.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 9e2100953..3029e29cd 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -6,6 +6,13 @@ environment: PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1" + + - PYTHON: "C:/Python37-x64" + PYTHON_VERSION: "3.7" + PYTHON_ARCH: "64" + ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1" init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" @@ -19,10 +26,10 @@ install: - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install coveralls[yaml]" - "%PYTHON%/python.exe -m pip install coverage" - - "%PYTHON%/python.exe -m pip install https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1" + - "%PYTHON%/python.exe -m pip install %PYICU_PKG%" - "%PYTHON%/python.exe -m pip install %ARTAGGER_PKG%" - "%PYTHON%/python.exe -m pip install -e .[artagger,icu,ipa,ner,thai2fit,deepcut]" test_script: - "%PYTHON%/python.exe -m pip --version" - - "%PYTHON%/python.exe -m coverage run --source=pythainlp setup.py test" \ No newline at end of file + - "%PYTHON%/python.exe -m coverage run --source=pythainlp setup.py test" From 70b8a8494b5ed2181fa4a7e180481e8ec2bd7645 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 3 May 2019 22:15:26 +0700 Subject: [PATCH 3/7] prepare for PyICU version update and Python 3.7 test --- appveyor.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 3029e29cd..9c7b7bca0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,17 +2,23 @@ build: off environment: matrix: +# - PYTHON: "C:/Python36" +# PYTHON_VERSION: "3.6" +# PYTHON_ARCH: "32" +# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" +# PYICU_PKG: "" + - PYTHON: "C:/Python36-x64" PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" PYICU_PKG: "https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1" - - PYTHON: "C:/Python37-x64" - PYTHON_VERSION: "3.7" - PYTHON_ARCH: "64" - ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" - PYICU_PKG: "https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1" +# - PYTHON: "C:/Python37-x64" +# PYTHON_VERSION: "3.7" +# PYTHON_ARCH: "64" +# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" +# PYICU_PKG: "" init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" From bd5683153c5fd68a3892965b3561d4e695c4961c Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 5 May 2019 00:48:05 +0700 Subject: [PATCH 4/7] Add Python 3.7 (win32, amd64) and 3.6 (win32) test environments --- appveyor.yml | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 9c7b7bca0..86156651a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,24 +1,33 @@ +# Unofficial Windows Binaries for Python Extension Packages +# from https://www.lfd.uci.edu/~gohlke/pythonlibs/ + build: off environment: matrix: -# - PYTHON: "C:/Python36" -# PYTHON_VERSION: "3.6" -# PYTHON_ARCH: "32" -# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" -# PYICU_PKG: "" + - PYTHON: "C:/Python36" + PYTHON_VERSION: "3.6" + PYTHON_ARCH: "32" + ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/pahorbq29y9cura/PyICU-2.3.1-cp36-cp36m-win32.whl?dl=1" - PYTHON: "C:/Python36-x64" PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" - PYICU_PKG: "https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1" + PYICU_PKG: "https://www.dropbox.com/s/7t0rrxwckqbgivi/PyICU-2.3.1-cp36-cp36m-win_amd64.whl?dl=1" -# - PYTHON: "C:/Python37-x64" -# PYTHON_VERSION: "3.7" -# PYTHON_ARCH: "64" -# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" -# PYICU_PKG: "" + - PYTHON: "C:/Python37" + PYTHON_VERSION: "3.7" + PYTHON_ARCH: "32" + ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/3xwdnwhdcu619x4/PyICU-2.3.1-cp37-cp37m-win32.whl?dl=1" + + - PYTHON: "C:/Python37-x64" + PYTHON_VERSION: "3.7" + PYTHON_ARCH: "64" + ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/le5dckc3231opqt/PyICU-2.3.1-cp37-cp37m-win_amd64.whl?dl=1" init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" From 38557efbfe85887c14352ea34a9337b776369781 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 5 May 2019 00:50:36 +0700 Subject: [PATCH 5/7] fix indent --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 86156651a..9b1ab7cd0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,7 +17,7 @@ environment: ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" PYICU_PKG: "https://www.dropbox.com/s/7t0rrxwckqbgivi/PyICU-2.3.1-cp36-cp36m-win_amd64.whl?dl=1" - - PYTHON: "C:/Python37" + - PYTHON: "C:/Python37" PYTHON_VERSION: "3.7" PYTHON_ARCH: "32" ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" From fcadc2fd92c1bc61a536d853d7376013882c802b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 5 May 2019 00:55:56 +0700 Subject: [PATCH 6/7] Test only amd64 --- appveyor.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 9b1ab7cd0..4e6f5b222 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,15 +1,15 @@ -# Unofficial Windows Binaries for Python Extension Packages -# from https://www.lfd.uci.edu/~gohlke/pythonlibs/ +# Use unofficial Windows Binaries for Python Extension Packages from +# https://www.lfd.uci.edu/~gohlke/pythonlibs/ build: off environment: matrix: - - PYTHON: "C:/Python36" - PYTHON_VERSION: "3.6" - PYTHON_ARCH: "32" - ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" - PYICU_PKG: "https://www.dropbox.com/s/pahorbq29y9cura/PyICU-2.3.1-cp36-cp36m-win32.whl?dl=1" +# - PYTHON: "C:/Python36" +# PYTHON_VERSION: "3.6" +# PYTHON_ARCH: "32" +# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" +# PYICU_PKG: "https://www.dropbox.com/s/pahorbq29y9cura/PyICU-2.3.1-cp36-cp36m-win32.whl?dl=1" - PYTHON: "C:/Python36-x64" PYTHON_VERSION: "3.6" @@ -17,11 +17,11 @@ environment: ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" PYICU_PKG: "https://www.dropbox.com/s/7t0rrxwckqbgivi/PyICU-2.3.1-cp36-cp36m-win_amd64.whl?dl=1" - - PYTHON: "C:/Python37" - PYTHON_VERSION: "3.7" - PYTHON_ARCH: "32" - ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" - PYICU_PKG: "https://www.dropbox.com/s/3xwdnwhdcu619x4/PyICU-2.3.1-cp37-cp37m-win32.whl?dl=1" +# - PYTHON: "C:/Python37" +# PYTHON_VERSION: "3.7" +# PYTHON_ARCH: "32" +# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" +# PYICU_PKG: "https://www.dropbox.com/s/3xwdnwhdcu619x4/PyICU-2.3.1-cp37-cp37m-win32.whl?dl=1" - PYTHON: "C:/Python37-x64" PYTHON_VERSION: "3.7" @@ -31,7 +31,7 @@ environment: init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" - - ps: "ls C:/Python*" +# - ps: "ls C:/Python*" install: - "chcp 65001" From 1f56415c771e770c38559178399fbfcf85b0c54d Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 5 May 2019 01:13:41 +0700 Subject: [PATCH 7/7] fix deprecated inline flag --- pythainlp/corpus/tnc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 651e704a1..4ce7a3337 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -48,7 +48,7 @@ def word_freq(word: str, domain: str = "all") -> int: r = requests.post(url, data=data) - pat = re.compile(r'TOTAL(?s).*?#ffffff">(.*?)') + pat = re.compile(r'TOTAL.*?#ffffff">(.*?)', flags=re.DOTALL) match = pat.search(r.text) n = 0