Skip to content

Commit

Permalink
Merge pull request #9 from PabloCastellano/use_requests
Browse files: browse the repository at this point in the history
Use requests
  • Loading branch information
PabloCastellano committed Mar 11, 2017
2 parents a87349b + d7fccaf commit 721d16e
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Version 0.3.0 [unreleased]
--------------------------

- Cambios en el formato BORME-JSON
- Usar requests en lugar de urllib


Version 0.2.4 [2016-09-21]
Expand Down
16 changes: 5 additions & 11 deletions bormeparser/borme.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,11 @@
import json
import os
import re
import requests
import six

from lxml import etree

try:
# Python 3
FileNotFoundError
from urllib import request
except NameError:
# Python 2
FileNotFoundError = IOError
import urllib as request

logger = logging.getLogger(__name__)
ch = logging.StreamHandler()
logger.addHandler(ch)
Expand Down Expand Up @@ -186,7 +178,9 @@ def parse_date(fecha):
return datetime.datetime.strptime(fecha, '%d/%m/%Y').date()

if source.startswith('http'):
self.xml = etree.parse(request.urlopen(source))
req = requests.get(source)
content = req.text.encode("ISO-8859-1")
self.xml = etree.fromstring(content).getroottree()
else:
self.xml = etree.parse(source)

Expand Down Expand Up @@ -218,7 +212,7 @@ def from_file(path, secure=USE_HTTPS):

if not path.startswith('http'):
if not os.path.exists(path):
raise FileNotFoundError(path)
raise IOError(path)
bxml.filename = path

bxml._load(path)
Expand Down
40 changes: 26 additions & 14 deletions bormeparser/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,22 @@

import datetime
import os
import requests
import time
from lxml import etree
from threading import Thread

from .exceptions import BormeDoesntExistException
from .parser import parse as parse_borme
from .seccion import SECCION

requests.adapters.DEFAULT_RETRIES = 3

try:
# Python 3
from queue import Queue
from urllib import request
except ImportError:
from Queue import Queue
import urllib as request

import logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -140,8 +142,10 @@ def get_url_borme_c(date, some_number, format='xml'):

def get_nbo_from_xml(source):
""" Número de Boletín Oficial """
if source.startswith('https'):
tree = etree.parse(request.urlopen(source))
if source.startswith('http'):
req = requests.get(source)
content = req.text.encode('iso-8859-1')
tree = etree.fromstring(content).getroottree()
else:
tree = etree.parse(source)

Expand All @@ -154,12 +158,13 @@ def get_nbo_from_xml(source):
def get_url_pdfs_provincia(date, provincia, secure=USE_HTTPS):
""" Obtiene las URLs para descargar los BORMEs de la provincia y fecha indicada """
url = get_url_xml(date, secure=secure)
req = requests.get(url)
if secure:
tree = etree.parse(request.urlopen(url))
protocol = 'https'
else:
tree = etree.parse(url)
protocol = 'http'
content = req.text.encode('iso-8859-1')
tree = etree.fromstring(content).getroottree()

if tree.getroot().tag != 'sumario':
raise BormeDoesntExistException
Expand Down Expand Up @@ -194,12 +199,13 @@ def get_url_pdfs_seccion(date, seccion, secure=USE_HTTPS):
raise ValueError('Section must be: A or B')

url = get_url_xml(date, secure=secure)
req = requests.get(url)
if secure:
tree = etree.parse(request.urlopen(url))
protocol = 'https'
else:
tree = etree.parse(url)
protocol = 'http'
content = req.text.encode('iso-8859-1')
tree = etree.fromstring(content).getroottree()

if tree.getroot().tag != 'sumario':
raise BormeDoesntExistException
Expand Down Expand Up @@ -241,12 +247,13 @@ def get_url_seccion_c(date, format='xml', secure=USE_HTTPS):
raise ValueError('format must be "xml", "htm" or "pdf"')

url = get_url_xml(date, secure=secure)
req = requests.get(url)
if secure:
tree = etree.parse(request.urlopen(url))
protocol = 'https'
else:
tree = etree.parse(url)
protocol = 'http'
content = req.text.encode('iso-8859-1')
tree = etree.fromstring(content).getroottree()

if tree.getroot().tag != 'sumario':
raise BormeDoesntExistException
Expand Down Expand Up @@ -289,18 +296,22 @@ def get_url_xml(date, secure=USE_HTTPS):
return BORME_XML_URL.format(protocol=protocol, year=date.year, month=date.month, day=date.day)


# TODO: FileExistsError (subclass of OSError)
def download_url(url, filename=None):
logger.debug('Downloading URL: %s' % url)
if os.path.exists(filename):
logger.warning('%s already exists!' % os.path.basename(filename))
return False

local_filename, headers = request.urlretrieve(url, filename)
content_length = headers['content-length']
req = requests.get(url, stream=True)
with open(filename, "wb") as fp:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
fp.write(chunk)

content_length = req.headers['content-length']
logger.debug("%.2f KB" % (int(content_length) / 1024.0))

return True, local_filename
return True


def download_urls(urls, path):
Expand Down Expand Up @@ -381,6 +392,7 @@ def __init__(self, thread_id, queue, files):
def run(self):
while True:
url, full_path = self.queue.get()
time.sleep(0.1)
downloaded = download_url(url, full_path)

if downloaded:
Expand Down
14 changes: 3 additions & 11 deletions bormeparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,6 @@
import importlib
import os

try:
# Python 3
from urllib import request
FileNotFoundError
except ImportError:
# Python 2
import urllib as request
FileNotFoundError = IOError

# backends
DEFAULT_PARSER = {'A': ('bormeparser.backends.pypdf2.parser', 'PyPDF2Parser'),
'C': ('bormeparser.backends.seccion_c.lxml.parser', 'LxmlBormeCParser')}
Expand All @@ -43,9 +34,10 @@ def parse(data, seccion):
borme = parser(data).parse()
elif data.startswith('http'):
# TODO
#content = request.urlopen(data).read()
# req = requests.get(data)
# content = req.text
borme = parser(data).parse()
else:
raise FileNotFoundError(data)
raise IOError(data)

return borme
8 changes: 1 addition & 7 deletions bormeparser/tests/test_borme.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,6 @@
from bormeparser.seccion import SECCION
from bormeparser.provincia import PROVINCIA

try:
FileNotFoundError
except NameError:
# Python 2
FileNotFoundError = IOError

EXAMPLES_PATH = os.path.join(os.path.dirname(bormeparser.__file__), '..', 'examples')

DATA1 = {214: {'Actos': [{'Ceses/Dimisiones': {'Adm. Unico': {'JUAN GARCIA GARCIA'}}},
Expand Down Expand Up @@ -231,7 +225,7 @@ def test_from_file(self):
self.assertEqual(bxml.next_borme, datetime.date(year=self.date[0], month=self.date[1], day=self.date[2] + 1))

# Exceptions
self.assertRaises(FileNotFoundError, BormeXML.from_file, 'invalidfile.xml')
self.assertRaises(IOError, BormeXML.from_file, 'invalidfile.xml')

def test_from_date(self):
bxml = BormeXML.from_date(self.date)
Expand Down
1 change: 1 addition & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
lxml==3.6.4
PyPDF2==1.26.0
requests==2.7.0
six==1.10.0
wheel==0.29.0
7 changes: 3 additions & 4 deletions scripts/borme_poller.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@

# El BORME se publica los días laborables y normalmente a las 7:30 de la mañana
import datetime
import requests
import time

from urllib import request

URL_BASE = 'https://boe.es/diario_borme/xml.php?id=BORME-S-' # https://boe.es/diario_borme/xml.php?id=BORME-S-20150910
DELAY = 5 * 60 # 5 minutes
LOGFILE = 'xmlpoller.log'
Expand Down Expand Up @@ -94,8 +93,8 @@ def poll_xml_dl():
if weekday in (5, 6):
wait_till_monday(weekday)
url = URL_BASE + today.strftime('%Y%m%d')
content = request.urlopen(url, timeout=TIMEOUT).read()
found = parse_content(content)
req = requests.get(url, timeout=TIMEOUT)
found = parse_content(req.text)
if found:
wait_till_seven()
else:
Expand Down
2 changes: 1 addition & 1 deletion scripts/check_bormes.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def check_range(begin, end, provincia, seccion, directory, download_xml):
xml_path = get_borme_xml_filepath(next_date, directory)
try:
bxml = BormeXML.from_file(xml_path)
except FileNotFoundError:
except IOError:
if download_xml:
logger.info('Downloading {}'.format(os.path.basename(xml_path)))
bxml = BormeXML.from_date(next_date)
Expand Down
2 changes: 1 addition & 1 deletion scripts/download_borme_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def download_range(begin, end, directory, seccion, provincia=None):
pass
bxml.save_to_file(xml_path)

except FileNotFoundError:
except IOError:
logger.debug('Downloading {filename}'.format(filename=os.path.basename(xml_path)))
bxml = BormeXML.from_date(next_date)
try:
Expand Down

0 comments on commit 721d16e

Please sign in to comment.