76 changes: 75 additions & 1 deletion pythainlp/corpus/__init__.py
@@ -1,4 +1,78 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from pythainlp.tools import get_path_db, get_path_data
from tinydb import TinyDB, Query
from future.moves.urllib.request import urlopen
from tqdm import tqdm
import requests
import os
import math
from nltk.corpus import names
#__all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
path_db_ = get_path_db()

def get_file(name):
    """Return the local path of a registered corpus file, or None if unknown."""
    db = TinyDB(path_db_)
    temp = Query()
    found = db.search(temp.name == name)
    db.close()
    if not found:
        return None
    path = get_path_data(found[0]['file'])
    if not os.path.exists(path):
        download(name)
    return path
def download_(url, dst):
    """
    :param url: URL to download the file from
    :param dst: filename to store it under, inside the pythainlp data directory
    """
    target = get_path_data(dst)
    file_size = int(urlopen(url).info().get('Content-Length', -1))
    # Resume from the size of a partially downloaded target, if any.
    if os.path.exists(target):
        first_byte = os.path.getsize(target)
    else:
        first_byte = 0
    if first_byte >= file_size:
        return file_size
    header = {"Range": "bytes=%s-%s" % (first_byte, file_size)}
    pbar = tqdm(
        total=file_size, initial=first_byte,
        unit='B', unit_scale=True, desc=url.split('/')[-1])
    req = requests.get(url, headers=header, stream=True)
    # Open in append mode so a resumed download does not truncate the file.
    with open(target, 'ab') as f:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                pbar.update(1024)
    pbar.close()
    return file_size
def download(name, force=False):
    db = TinyDB(path_db_)
    temp = Query()
    data = requests.get("https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json")
    data_json = data.json()
    if name in data_json:
        temp_name = data_json[name]
        print("Download: " + name)
        if len(db.search(temp.name == name)) == 0:
            print(name + " " + temp_name['version'])
            download_(temp_name['download'], temp_name['file_name'])
            db.insert({'name': name, 'version': temp_name['version'], 'file': temp_name['file_name']})
        else:
            # TinyDB conditions must be combined with '&'; the Python 'and'
            # operator would silently evaluate only one of them.
            if len(db.search((temp.name == name) & (temp.version == temp_name['version']))) == 0:
                print("Update available: " + name + " " + db.search(temp.name == name)[0]['version'] + " -> " + temp_name['version'])
                yes_no = "y"
                if not force:
                    yes_no = str(input("y or n : ")).lower()
                if yes_no == "y":
                    download_(temp_name['download'], temp_name['file_name'])
                    db.update({'version': temp_name['version']}, temp.name == name)
            else:
                print("Re-download " + name + " " + temp_name['version'])
                yes_no = "y"
                if not force:
                    yes_no = str(input("y or n : ")).lower()
                if yes_no == "y":
                    download_(temp_name['download'], temp_name['file_name'])
                    db.update({'version': temp_name['version']}, temp.name == name)
    db.close()
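A minimal usage sketch of the new corpus API (assuming the package is importable and a corpus named 'thai2vec' is listed in the remote db.json):

from pythainlp.corpus import download, get_file

download('thai2vec', force=True)  # force=True skips the interactive y/n prompt
path = get_file('thai2vec')       # local path under ~/pythainlp-data, or None if unregistered
print(path)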
18 changes: 16 additions & 2 deletions pythainlp/tools/__init__.py
@@ -4,11 +4,25 @@
import dill
from pythainlp.tokenize import tcc
import marisa_trie
def get_path_db():
    """Return the path of the corpus registry, creating an empty TinyDB if needed."""
    path = os.path.join(get_path_pythainlp_data(), "db.json")
    if not os.path.exists(path):
        from tinydb import TinyDB
        TinyDB(path)  # creates an empty db.json
    return path

def get_path_data(filename):
    return os.path.join(get_path_pythainlp_data(), filename)

def get_path_pythainlp_data():
    path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
    if not os.path.exists(path):
        os.makedirs(path)
    return path
def file_trie(data):
    '''
    Create the data file for trie-based systems.
    '''
    path = get_path_pythainlp_data()  # directory is created by the helper if missing
    if data == "newmm":
@@ -65,4 +79,4 @@ def test_segmenter(segmenter, test):
print(newmm)
print("mm :")
mm = test_segmenter(word_tokenize(text, engine='mm'), test)
print(mm)
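A quick sketch of how the new path helpers compose (output paths assume a Unix home directory):

from pythainlp.tools import get_path_pythainlp_data, get_path_db, get_path_data

print(get_path_pythainlp_data())      # e.g. /home/user/pythainlp-data
print(get_path_db())                  # e.g. /home/user/pythainlp-data/db.json
print(get_path_data('thai2vec.vec'))  # e.g. /home/user/pythainlp-data/thai2vec.vec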
15 changes: 6 additions & 9 deletions pythainlp/word_vector/thai2vec.py
@@ -21,18 +21,15 @@
print("Error ! using 'pip install gensim numpy'")
sys.exit(0)
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import get_file
from pythainlp.corpus import download as download_data
import os

def download():
path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
if not os.path.exists(path):
os.makedirs(path)
path = os.path.join(path, 'thai2vec.vec')
if not os.path.exists(path):
print("Download models...")
from urllib import request
request.urlretrieve("https://www.dropbox.com/s/upnbmiebkfma7oy/thai2vec.vec?dl=1",path)
print("OK.")
path = get_file('thai2vec')
if path is None:
download_data('thai2vec')
path = get_file('thai2vec')
return path
def get_model():
return KeyedVectors.load_word2vec_format(download(), binary=False)
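With this change, the model file is managed by the corpus registry instead of an ad-hoc Dropbox download. A minimal sketch of the resulting flow (assuming gensim is installed and both words are in the thai2vec vocabulary):

from pythainlp.word_vector.thai2vec import get_model

model = get_model()  # downloads thai2vec.vec via the corpus API on first call
print(model.similarity('แมว', 'หมา'))  # cosine similarity of 'cat' and 'dog'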
4 changes: 3 additions & 1 deletion requirements-travis.txt
@@ -7,4 +7,6 @@ langdetect
requests
dill
pytz
conllu
tinydb
tqdm
6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,8 +1,10 @@
nltk>=3.2.2
future>=0.16.0
six
marisa_trie
marisa_trie<=0.7.4
requests
dill
pytz
conllu
tinydb
tqdm
17 changes: 3 additions & 14 deletions setup.py
@@ -1,22 +1,11 @@
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages
import codecs
with codecs.open('README.rst','r',encoding='utf-8') as readme_file:
with codecs.open('README.md','r',encoding='utf-8') as readme_file:
readme = readme_file.read()
readme_file.close()
requirements = [
'nltk>=3.2.2',
'future>=0.16.0',
'six',
'marisa_trie<=0.7.4',
'requests',
'dill',
'pytz',
'conllu'
]
test_requirements = [
# TODO: put package test requirements here
]
with codecs.open('requirements.txt','r',encoding='utf-8') as f:
requirements = f.read().splitlines()

setup(
name='pythainlp',