diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 18a3bfdb8..b8361f789 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -1,4 +1,78 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import,unicode_literals +from pythainlp.tools import get_path_db,get_path_data +from tinydb import TinyDB,Query +from future.moves.urllib.request import urlopen +from tqdm import tqdm +import requests +import os +import math +import requests from nltk.corpus import names -#__all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"] \ No newline at end of file +#__all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"] +path_db_=get_path_db() +def get_file(name): + db=TinyDB(path_db_) + temp = Query() + if len(db.search(temp.name==name))>0: + path= get_path_data(db.search(temp.name==name)[0]['file']) + db.close() + if not os.path.exists(path): + download(name) + return path +def download_(url, dst): + """ + @param: url to download file + @param: dst place to put the file + """ + file_size = int(urlopen(url).info().get('Content-Length', -1)) + if os.path.exists(dst): + first_byte = os.path.getsize(dst) + else: + first_byte = 0 + if first_byte >= file_size: + return file_size + header = {"Range": "bytes=%s-%s" % (first_byte, file_size)} + pbar = tqdm( + total=file_size, initial=first_byte, + unit='B', unit_scale=True, desc=url.split('/')[-1]) + req = requests.get(url, headers=header, stream=True) + with(open(get_path_data(dst), 'wb')) as f: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + pbar.update(1024) + pbar.close() + #return file_size +def download(name,force=False): + db=TinyDB(path_db_) + temp = Query() + data=requests.get("https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json") + data_json=data.json() + if name in list(data_json.keys()): + temp_name=data_json[name] + print("Download : "+name) + if len(db.search(temp.name==name))==0: + 
print(name+" "+temp_name['version']) + download_(temp_name['download'],temp_name['file_name']) + db.insert({'name': name, 'version': temp_name['version'],'file':temp_name['file_name']}) + else: + if len(db.search(temp.name==name and temp.version==temp_name['version']))==0: + print("have update") + print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version']) + yes_no="y" + if force==False: + yes_no=str(input("y or n : ")).lower() + if "y"==yes_no: + download_(temp_name['download'],temp_name['file_name']) + db.update({'version':temp_name['version']},temp.name==name) + else: + print("re-download") + print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version']) + yes_no="y" + if force==False: + yes_no=str(input("y or n : ")).lower() + if "y"==yes_no: + download_(temp_name['download'],temp_name['file_name']) + db.update({'version':temp_name['version']},temp.name==name) + db.close() \ No newline at end of file diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index 8f202b364..83847e460 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -4,11 +4,25 @@ import dill from pythainlp.tokenize import tcc import marisa_trie +def get_path_db(): + path = os.path.join(get_path_pythainlp_data(), "db.json") + if not os.path.exists(path): + from tinydb import TinyDB + db=TinyDB(path) + #db.insert({'name': 'hi', 'version': '0.1','file':''}) + return path +def get_path_data(filename): + return os.path.join(get_path_pythainlp_data(), filename) +def get_path_pythainlp_data(): + path= os.path.join(os.path.expanduser("~"), 'pythainlp-data') + if not os.path.exists(path): + os.makedirs(path) + return path def file_trie(data): ''' ใช้สร้างไฟล์ข้อมูลสำหรับระบบที่ใช้ trie ''' - path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')#os.path.join(, 'pthainlp_trie.data') + path = get_path_pythainlp_data() if not os.path.exists(path): 
def download():
    """Return the local path of the thai2vec model, fetching it if needed.

    Looks the model up in the corpus catalog first; only triggers a
    network download when it is not installed yet.

    :return: absolute path of the thai2vec vector file.
    """
    path = get_file("thai2vec")
    if path is None:  # idiom fix: `is None`, not `== None`
        download_data("thai2vec")
        path = get_file("thai2vec")
    return path


def get_model():
    """Load thai2vec as gensim ``KeyedVectors`` (word2vec text format)."""
    return KeyedVectors.load_word2vec_format(download(), binary=False)
codecs.open('README.rst','r',encoding='utf-8') as readme_file: +with codecs.open('README.md','r',encoding='utf-8') as readme_file: readme = readme_file.read() readme_file.close() -requirements = [ - 'nltk>=3.2.2', - 'future>=0.16.0', - 'six', - 'marisa_trie<=0.7.4', - 'requests', - 'dill', - 'pytz', - 'conllu' -] -test_requirements = [ - # TODO: put package test requirements here -] +with codecs.open('requirements.txt','r',encoding='utf-8') as f: + requirements = f.read().splitlines() setup( name='pythainlp',