diff --git a/appveyor.yml b/appveyor.yml index 9e2100953..4e6f5b222 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,15 +1,37 @@ +# Use unofficial Windows Binaries for Python Extension Packages from +# https://www.lfd.uci.edu/~gohlke/pythonlibs/ + build: off environment: matrix: +# - PYTHON: "C:/Python36" +# PYTHON_VERSION: "3.6" +# PYTHON_ARCH: "32" +# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" +# PYICU_PKG: "https://www.dropbox.com/s/pahorbq29y9cura/PyICU-2.3.1-cp36-cp36m-win32.whl?dl=1" + - PYTHON: "C:/Python36-x64" PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/7t0rrxwckqbgivi/PyICU-2.3.1-cp36-cp36m-win_amd64.whl?dl=1" + +# - PYTHON: "C:/Python37" +# PYTHON_VERSION: "3.7" +# PYTHON_ARCH: "32" +# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" +# PYICU_PKG: "https://www.dropbox.com/s/3xwdnwhdcu619x4/PyICU-2.3.1-cp37-cp37m-win32.whl?dl=1" + + - PYTHON: "C:/Python37-x64" + PYTHON_VERSION: "3.7" + PYTHON_ARCH: "64" + ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/le5dckc3231opqt/PyICU-2.3.1-cp37-cp37m-win_amd64.whl?dl=1" init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" - - ps: "ls C:/Python*" +# - ps: "ls C:/Python*" install: - "chcp 65001" @@ -19,10 +41,10 @@ install: - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install coveralls[yaml]" - "%PYTHON%/python.exe -m pip install coverage" - - "%PYTHON%/python.exe -m pip install https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1" + - "%PYTHON%/python.exe -m pip install %PYICU_PKG%" - "%PYTHON%/python.exe -m pip install %ARTAGGER_PKG%" - "%PYTHON%/python.exe -m pip install -e .[artagger,icu,ipa,ner,thai2fit,deepcut]" test_script: - "%PYTHON%/python.exe -m pip --version" - - "%PYTHON%/python.exe -m coverage run --source=pythainlp setup.py test" \ No newline at end of file + - "%PYTHON%/python.exe -m coverage run --source=pythainlp setup.py test" diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 46e557546..036dbf9d7 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +from typing import NoReturn, Union from urllib.request import urlopen import requests @@ -14,9 +15,9 @@ _CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME) _CORPUS_DB_URL = ( - "https://raw.githubusercontent.com/" + - "PyThaiNLP/pythainlp-corpus/" + - "master/db.json" + "https://raw.githubusercontent.com/" + + "PyThaiNLP/pythainlp-corpus/" + + "master/db.json" ) _CORPUS_DB_FILENAME = "db.json" @@ -51,7 +52,7 @@ def get_corpus(filename: str) -> frozenset: return frozenset(lines) -def get_corpus_path(name: str) -> [str, None]: +def get_corpus_path(name: str) -> Union[str, None]: """ Get corpus path @@ -72,18 +73,21 @@ def get_corpus_path(name: str) -> [str, None]: return None -def _download(url: str, dst: str): +def _download(url: str, dst: str) -> int: """ @param: url to download file @param: dst place to put the file """ file_size = int(urlopen(url).info().get("Content-Length", -1)) + if os.path.exists(dst): first_byte = os.path.getsize(dst) else: first_byte = 0 + if first_byte >= file_size: return file_size + header = {"Range": "bytes=%s-%s" % (first_byte, file_size)} pbar = tqdm( total=file_size, @@ -99,10 +103,11 @@ def _download(url: str, dst: str): f.write(chunk) pbar.update(1024) pbar.close() - # return file_size + return file_size -def download(name: str, force: bool = False): + +def download(name: str, force: bool = False) -> NoReturn: """ Download corpus @@ -113,6 +118,7 @@ def download(name: str, force: bool = False): temp = Query() data = requests.get(corpus_db_url()) data_json = data.json() + if name in list(data_json.keys()): temp_name = data_json[name] print("Download:", name) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 651e704a1..4ce7a3337 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -48,7 +48,7 @@ def word_freq(word: str, domain: str = "all") -> int: r = requests.post(url, data=data) - pat = re.compile(r'TOTAL(?s).*?#ffffff">(.*?)') + pat = re.compile(r'TOTAL.*?#ffffff">(.*?)', flags=re.DOTALL) match = pat.search(r.text) n = 0 diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 2d509cb15..eccbb27d1 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -34,7 +34,7 @@ def _keep( min_len: int, max_len: int, dict_filter: Callable[[str], bool], -): +) -> Callable[[str], bool]: """ Keep only Thai words with at least min_freq frequency and has length between min_len and max_len characters diff --git a/tests/__init__.py b/tests/__init__.py index 4ffcaf2c8..1c2e217d0 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -577,6 +577,7 @@ def test_thai_strftime(self): thai_strftime(date, "%Aที่ %d %B พ.ศ. %Y เวลา %H:%Mน. (%a %d-%b-%y) %% %"), "วันพุธที่ 06 ตุลาคม พ.ศ. 2519 เวลา 01:40น. (พ 06-ต.ค.-19) % %", ) + self.assertIsNotNone(thai_strftime(date, "%A%a%B%b%C%c%D%F%G%g%v%X%x%Y%y%+")) # ### pythainlp.util.normalize