76 changes: 75 additions & 1 deletion pythainlp/corpus/__init__.py
@@ -1,4 +1,78 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from pythainlp.tools import get_path_db, get_path_data
from tinydb import TinyDB, Query
from future.moves.urllib.request import urlopen
from tqdm import tqdm
import requests
import os
import math
from nltk.corpus import names
#__all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
path_db_ = get_path_db()

def get_file(name):
    """Return the local path of a registered corpus file, or None if unknown."""
    db = TinyDB(path_db_)
    temp = Query()
    found = db.search(temp.name == name)
    db.close()
    if not found:
        return None
    path = get_path_data(found[0]['file'])
    if not os.path.exists(path):
        download(name)
    return path
def download_(url, dst):
    """
    :param url: URL to download the file from
    :param dst: filename to store it under, inside the pythainlp data directory
    """
    target = get_path_data(dst)
    file_size = int(urlopen(url).info().get('Content-Length', -1))
    # Resume from the size of a partially downloaded target, if any.
    if os.path.exists(target):
        first_byte = os.path.getsize(target)
    else:
        first_byte = 0
    if first_byte >= file_size:
        return file_size
    header = {"Range": "bytes=%s-%s" % (first_byte, file_size)}
    pbar = tqdm(
        total=file_size, initial=first_byte,
        unit='B', unit_scale=True, desc=url.split('/')[-1])
    req = requests.get(url, headers=header, stream=True)
    # Open in append mode so a resumed download does not truncate the file.
    with open(target, 'ab') as f:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                pbar.update(1024)
    pbar.close()
    return file_size
def download(name, force=False):
    db = TinyDB(path_db_)
    temp = Query()
    data = requests.get("https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json")
    data_json = data.json()
    if name in data_json:
        temp_name = data_json[name]
        print("Download: " + name)
        if len(db.search(temp.name == name)) == 0:
            print(name + " " + temp_name['version'])
            download_(temp_name['download'], temp_name['file_name'])
            db.insert({'name': name, 'version': temp_name['version'], 'file': temp_name['file_name']})
        else:
            # TinyDB conditions must be combined with '&'; the Python 'and'
            # operator would silently evaluate only one of them.
            if len(db.search((temp.name == name) & (temp.version == temp_name['version']))) == 0:
                print("Update available: " + name + " " + db.search(temp.name == name)[0]['version'] + " -> " + temp_name['version'])
                yes_no = "y"
                if not force:
                    yes_no = str(input("y or n : ")).lower()
                if yes_no == "y":
                    download_(temp_name['download'], temp_name['file_name'])
                    db.update({'version': temp_name['version']}, temp.name == name)
            else:
                print("Re-download " + name + " " + temp_name['version'])
                yes_no = "y"
                if not force:
                    yes_no = str(input("y or n : ")).lower()
                if yes_no == "y":
                    download_(temp_name['download'], temp_name['file_name'])
                    db.update({'version': temp_name['version']}, temp.name == name)
    db.close()
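A minimal usage sketch of the new corpus API (assuming the package is importable and a corpus named 'thai2vec' is listed in the remote db.json):

from pythainlp.corpus import download, get_file

download('thai2vec', force=True)  # force=True skips the interactive y/n prompt
path = get_file('thai2vec')       # local path under ~/pythainlp-data, or None if unregistered
print(path)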
18 changes: 16 additions & 2 deletions pythainlp/tools/__init__.py
@@ -4,11 +4,25 @@
import dill
from pythainlp.tokenize import tcc
import marisa_trie
def get_path_db():
    """Return the path of the corpus registry, creating an empty TinyDB if needed."""
    path = os.path.join(get_path_pythainlp_data(), "db.json")
    if not os.path.exists(path):
        from tinydb import TinyDB
        TinyDB(path)  # creates an empty db.json
    return path

def get_path_data(filename):
    return os.path.join(get_path_pythainlp_data(), filename)

def get_path_pythainlp_data():
    path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
    if not os.path.exists(path):
        os.makedirs(path)
    return path
def file_trie(data):
    '''
    Create the data file for trie-based systems.
    '''
    path = get_path_pythainlp_data()  # directory is created by the helper if missing
    if data == "newmm":
@@ -65,4 +79,4 @@ def test_segmenter(segmenter, test):
print(newmm)
print("mm :")
mm = test_segmenter(word_tokenize(text, engine='mm'), test)
print(mm)
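A quick sketch of how the new path helpers compose (output paths assume a Unix home directory):

from pythainlp.tools import get_path_pythainlp_data, get_path_db, get_path_data

print(get_path_pythainlp_data())      # e.g. /home/user/pythainlp-data
print(get_path_db())                  # e.g. /home/user/pythainlp-data/db.json
print(get_path_data('thai2vec.vec'))  # e.g. /home/user/pythainlp-data/thai2vec.vec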
15 changes: 6 additions & 9 deletions pythainlp/word_vector/thai2vec.py
@@ -21,18 +21,15 @@
print("Error ! using 'pip install gensim numpy'")
sys.exit(0)
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import get_file
from pythainlp.corpus import download as download_data
import os

def download():
path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
if not os.path.exists(path):
os.makedirs(path)
path = os.path.join(path, 'thai2vec.vec')
if not os.path.exists(path):
print("Download models...")
from urllib import request
request.urlretrieve("https://www.dropbox.com/s/upnbmiebkfma7oy/thai2vec.vec?dl=1",path)
print("OK.")
path = get_file('thai2vec')
if path is None:
download_data('thai2vec')
path = get_file('thai2vec')
return path
def get_model():
return KeyedVectors.load_word2vec_format(download(), binary=False)
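With this change, the model file is managed by the corpus registry instead of an ad-hoc Dropbox download. A minimal sketch of the resulting flow (assuming gensim is installed and both words are in the thai2vec vocabulary):

from pythainlp.word_vector.thai2vec import get_model

model = get_model()  # downloads thai2vec.vec via the corpus API on first call
print(model.similarity('แมว', 'หมา'))  # cosine similarity of 'cat' and 'dog'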
4 changes: 3 additions & 1 deletion requirements-travis.txt
@@ -7,4 +7,6 @@ langdetect
requests
dill
pytz
conllu
tinydb
tqdm
6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,8 +1,10 @@
nltk>=3.2.2
future>=0.16.0
six
marisa_trie
marisa_trie<=0.7.4
requests
dill
pytz
conllu
tinydb
tqdm
17 changes: 3 additions & 14 deletions setup.py
@@ -1,22 +1,11 @@
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages
import codecs
with codecs.open('README.rst','r',encoding='utf-8') as readme_file:
with codecs.open('README.md','r',encoding='utf-8') as readme_file:
readme = readme_file.read()
readme_file.close()
requirements = [
'nltk>=3.2.2',
'future>=0.16.0',
'six',
'marisa_trie<=0.7.4',
'requests',
'dill',
'pytz',
'conllu'
]
test_requirements = [
# TODO: put package test requirements here
]
with codecs.open('requirements.txt','r',encoding='utf-8') as f:
requirements = f.read().splitlines()

setup(
name='pythainlp',