Skip to content

Commit

Permalink
pypdf2 parser: extract date, num, seccion and provincia
Browse files Browse the repository at this point in the history
  • Loading branch information
PabloCastellano committed Jun 18, 2015
1 parent 96a78dd commit 831a80c
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 14 deletions.
18 changes: 15 additions & 3 deletions bormeparser/backends/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import locale
import os

from bormeparser.borme import Borme, BormeActo
import datetime

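# TODO: setlocale() raises locale.Error if the 'es_ES.utf8' locale has not been generated on this system.
# Run `locale -a` in a shell to list the locales that are available.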
locale.setlocale(locale.LC_TIME, 'es_ES.utf8')


class BormeParserBackend(object):
def __init__(self, filename):
Expand All @@ -17,11 +24,16 @@ def parse(self):
actos = self._parse()
bormeactos = []
for id_acto in actos.keys():
if not isinstance(id_acto, int):
continue
data = actos[id_acto]
a = BormeActo(id_acto, data['Empresa'], data['Actos'])
bormeactos.append(a)
# FIXME
return Borme(datetime.date(1970, 1, 1), 'DUMMY', 'DUMMY', bormeactos)

fecha = datetime.datetime.strptime(actos['borme_fecha'], '%A %d de %B de %Y')
fecha = datetime.date(fecha.year, fecha.month, fecha.day)
# FIXME: provincia and seccion are passed as raw strings here; presumably they should become proper objects
return Borme(fecha, actos['borme_seccion'], actos['borme_provincia'], actos['borme_num'], bormeactos)

def _parse(self):
"""
Expand Down
57 changes: 50 additions & 7 deletions bormeparser/backends/pypdf2/functions.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
from PyPDF2 import PdfFileReader
from bormeparser.regex import regex_cargos, REGEX_EMPRESA, REGEX_TEXT

from bormeparser.regex import regex_cargos, REGEX_EMPRESA, REGEX_TEXT, REGEX_BORME_NUM
from bormeparser.acto import ACTO

ACTOS = {}
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
logger.setLevel(logging.WARN)

DATA = {'borme_fecha': None, 'borme_num': None, 'borme_seccion': None, 'borme_provincia': None}


def clean_data(data):
Expand All @@ -20,11 +26,15 @@ def parse_content(content):
nombreacto = None
acto_id = None
empresa = None
fecha = False
numero = False
seccion = False
provincia = False

# Python 3: the content may arrive as bytes, so decode it to str before processing
if isinstance(content, bytes):
content = content.decode('unicode_escape')
#print(content)
logger.debug(content)

for line in content.split('\n'):
if line.startswith('/Cabecera_acto'):
Expand All @@ -38,6 +48,26 @@ def parse_content(content):
data = ""
continue

if line.startswith('/Fecha'):
if not DATA['borme_fecha']:
fecha = True
continue

if line.startswith('/Numero_BORME'):
if not DATA['borme_num']:
numero = True
continue

if line.startswith('/Seccion'):
if not DATA['borme_seccion']:
seccion = True
continue

if line.startswith('/Provincia'):
if not DATA['borme_provincia']:
provincia = True
continue

if line == 'BT':
# Begin text object
continue
Expand All @@ -54,10 +84,10 @@ def parse_content(content):
texto = False
data = clean_data(data)
actos[nombreacto] = data
ACTOS[acto_id] = {'Empresa': empresa, 'Actos': actos}
DATA[acto_id] = {'Empresa': empresa, 'Actos': actos}
continue

if not texto and not cabecera:
if True not in (texto, cabecera, fecha, numero, seccion, provincia):
continue

if line == '/F1 8 Tf':
Expand Down Expand Up @@ -88,7 +118,20 @@ def parse_content(content):

m = REGEX_TEXT.match(line)
if m:
#print(m.group(1))
if fecha:
DATA['borme_fecha'] = m.group(1)
fecha = False
if numero:
text = m.group(1)
DATA['borme_num'] = int(REGEX_BORME_NUM.match(text).group(1))
numero = False
if seccion:
DATA['borme_seccion'] = m.group(1)
seccion = False
if provincia:
DATA['borme_provincia'] = m.group(1)
provincia = False
logger.debug(m.group(1))
data += ' ' + m.group(1)


Expand All @@ -97,4 +140,4 @@ def parse_file(filename):
for n in range(0, reader.getNumPages()):
content = reader.getPage(n).getContents().getData()
parse_content(content)
return ACTOS
return DATA
5 changes: 4 additions & 1 deletion bormeparser/backends/pypdf2/parser.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bormeparser.backends.base import BormeParserBackend
#from __future__ import absolute_import
#from functions import parse_file
from .functions import parse_file
from bormeparser.backends.base import BormeParserBackend



class PyPDF2Parser(BormeParserBackend):
Expand Down
3 changes: 2 additions & 1 deletion bormeparser/borme.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,13 @@ class BormeXML(object):
# TODO: Create instance directly from filename
class Borme(object):

def __init__(self, date, seccion, provincia, actos=None, url=None, filename=None):
def __init__(self, date, seccion, provincia, num, actos=None, url=None, filename=None):
if isinstance(date, tuple):
date = datetime.date(year=date[0], month=date[1], day=date[2])
self.date = date
self.seccion = seccion
self.provincia = provincia
self.num = num
self.actos = actos
self.url = url
self.filename = filename
Expand Down
9 changes: 8 additions & 1 deletion bormeparser/provincia.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,11 @@ class PROVINCIA:
ZAMORA = '49'
ZARAGOZA = '50'
CEUTA = '51'
MELILLA = '52'
MELILLA = '52'

"""
TODO:
@staticmethod
def from_string(string):
return PROVINCIA._keywords[string]
"""
2 changes: 1 addition & 1 deletion bormeparser/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

REGEX_EMPRESA = re.compile('^(\d+)\s+-\s+(.*)$')
REGEX_TEXT = re.compile('^\((.*)\)Tj$')

REGEX_BORME_NUM = re.compile('^Núm\. (\d+)')

def regex_cargos(data):
"""
Expand Down

0 comments on commit 831a80c

Please sign in to comment.