Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 25 additions & 3 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,37 @@
# Use unofficial Windows Binaries for Python Extension Packages from
# https://www.lfd.uci.edu/~gohlke/pythonlibs/

build: off

environment:
matrix:
# - PYTHON: "C:/Python36"
# PYTHON_VERSION: "3.6"
# PYTHON_ARCH: "32"
# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger"
# PYICU_PKG: "https://www.dropbox.com/s/pahorbq29y9cura/PyICU-2.3.1-cp36-cp36m-win32.whl?dl=1"

- PYTHON: "C:/Python36-x64"
PYTHON_VERSION: "3.6"
PYTHON_ARCH: "64"
ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger"
PYICU_PKG: "https://www.dropbox.com/s/7t0rrxwckqbgivi/PyICU-2.3.1-cp36-cp36m-win_amd64.whl?dl=1"

# - PYTHON: "C:/Python37"
# PYTHON_VERSION: "3.7"
# PYTHON_ARCH: "32"
# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger"
# PYICU_PKG: "https://www.dropbox.com/s/3xwdnwhdcu619x4/PyICU-2.3.1-cp37-cp37m-win32.whl?dl=1"

- PYTHON: "C:/Python37-x64"
PYTHON_VERSION: "3.7"
PYTHON_ARCH: "64"
ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger"
PYICU_PKG: "https://www.dropbox.com/s/le5dckc3231opqt/PyICU-2.3.1-cp37-cp37m-win_amd64.whl?dl=1"

init:
- "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%"
- ps: "ls C:/Python*"
# - ps: "ls C:/Python*"

install:
- "chcp 65001"
Expand All @@ -19,10 +41,10 @@ install:
- "%PYTHON%/python.exe -m pip install --upgrade pip"
- "%PYTHON%/python.exe -m pip install coveralls[yaml]"
- "%PYTHON%/python.exe -m pip install coverage"
- "%PYTHON%/python.exe -m pip install https://www.dropbox.com/s/g84479l8yhv5ohi/PyICU-2.2-cp36-cp36m-win_amd64.whl?dl=1"
- "%PYTHON%/python.exe -m pip install %PYICU_PKG%"
- "%PYTHON%/python.exe -m pip install %ARTAGGER_PKG%"
- "%PYTHON%/python.exe -m pip install -e .[artagger,icu,ipa,ner,thai2fit,deepcut]"

test_script:
- "%PYTHON%/python.exe -m pip --version"
- "%PYTHON%/python.exe -m coverage run --source=pythainlp setup.py test"
- "%PYTHON%/python.exe -m coverage run --source=pythainlp setup.py test"
20 changes: 13 additions & 7 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

import os
from typing import NoReturn, Union
from urllib.request import urlopen

import requests
Expand All @@ -14,9 +15,9 @@
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

_CORPUS_DB_URL = (
"https://raw.githubusercontent.com/" +
"PyThaiNLP/pythainlp-corpus/" +
"master/db.json"
"https://raw.githubusercontent.com/"
+ "PyThaiNLP/pythainlp-corpus/"
+ "master/db.json"
)

_CORPUS_DB_FILENAME = "db.json"
Expand Down Expand Up @@ -51,7 +52,7 @@ def get_corpus(filename: str) -> frozenset:
return frozenset(lines)


def get_corpus_path(name: str) -> [str, None]:
def get_corpus_path(name: str) -> Union[str, None]:
"""
Get corpus path

Expand All @@ -72,18 +73,21 @@ def get_corpus_path(name: str) -> [str, None]:
return None


def _download(url: str, dst: str):
def _download(url: str, dst: str) -> int:
"""
@param: url to download file
@param: dst place to put the file
"""
file_size = int(urlopen(url).info().get("Content-Length", -1))

if os.path.exists(dst):
first_byte = os.path.getsize(dst)
else:
first_byte = 0

if first_byte >= file_size:
return file_size

header = {"Range": "bytes=%s-%s" % (first_byte, file_size)}
pbar = tqdm(
total=file_size,
Expand All @@ -99,10 +103,11 @@ def _download(url: str, dst: str):
f.write(chunk)
pbar.update(1024)
pbar.close()
# return file_size

return file_size

def download(name: str, force: bool = False):

def download(name: str, force: bool = False) -> NoReturn:
"""
Download corpus

Expand All @@ -113,6 +118,7 @@ def download(name: str, force: bool = False):
temp = Query()
data = requests.get(corpus_db_url())
data_json = data.json()

if name in list(data_json.keys()):
temp_name = data_json[name]
print("Download:", name)
Expand Down
2 changes: 1 addition & 1 deletion pythainlp/corpus/tnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def word_freq(word: str, domain: str = "all") -> int:

r = requests.post(url, data=data)

pat = re.compile(r'TOTAL</font>(?s).*?#ffffff">(.*?)</font>')
pat = re.compile(r'TOTAL</font>.*?#ffffff">(.*?)</font>', flags=re.DOTALL)
match = pat.search(r.text)

n = 0
Expand Down
2 changes: 1 addition & 1 deletion pythainlp/spell/pn.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def _keep(
min_len: int,
max_len: int,
dict_filter: Callable[[str], bool],
):
) -> Callable[[str], bool]:
"""
Keep only Thai words with at least min_freq frequency
and has length between min_len and max_len characters
Expand Down
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,7 @@ def test_thai_strftime(self):
thai_strftime(date, "%Aที่ %d %B พ.ศ. %Y เวลา %H:%Mน. (%a %d-%b-%y) %% %"),
"วันพุธที่ 06 ตุลาคม พ.ศ. 2519 เวลา 01:40น. (พ 06-ต.ค.-19) % %",
)
self.assertIsNotNone(thai_strftime(date, "%A%a%B%b%C%c%D%F%G%g%v%X%x%Y%y%+"))

# ### pythainlp.util.normalize

Expand Down