Skip to content

Commit

Permalink
Merge pull request #129 from UnB-KnEDLe/dev
Browse files Browse the repository at this point in the history
Update Downloader
  • Loading branch information
alvesisaque committed Jul 2, 2021
2 parents f87b249 + fcc3a52 commit 43163be
Show file tree
Hide file tree
Showing 8 changed files with 167 additions and 118 deletions.
1 change: 0 additions & 1 deletion dodfminer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
try:

from dodfminer.extract.polished import acts

from dodfminer.downloader.core import Downloader
from dodfminer.extract.polished.core import ActsExtractor
from dodfminer.extract.pure.core import ContentExtractor
Expand Down
153 changes: 47 additions & 106 deletions dodfminer/downloader/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
import os
import tqdm
import requests
import urllib.parse

from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dodfminer.downloader.helper import check_date, get_downloads


MONTHS_STRING = ["", "01_Janeiro", "02_Fevereiro", "03_Março", "04_Abril",
"05_Maio", "06_Junho", "07_Julho", "08_Agosto",
Expand Down Expand Up @@ -57,11 +57,11 @@ def _string_to_date(self, date):
"""
if '/' in date:
date = datetime.strptime(date, '%m/%y').date()
date = datetime.strptime(date, '%m/%Y').date()
elif '-' in date:
date = datetime.strptime(date, '%m-%y').date()
date = datetime.strptime(date, '%m-%Y').date()
else:
msg = 'start_date or end_date must be in format mm/yy or mm-yy'
msg = 'start_date or end_date must be in format mm/yy or mm-yyyy'
raise Exception(msg)

return date
Expand Down Expand Up @@ -97,63 +97,6 @@ def _create_download_folder(self):
# import pdb; pdb.set_trace()
self._create_single_folder(self._download_path)

def _make_url(self, date):
"""Make the url to download the dodf.
Uses the date as parameter to download.
Args:
date (:obj:`datetime`): The date to download the DODF
Returns:
The complete url to the buriti website to download the DODF.
"""
url_string = "http://www.buriti.df.gov.br/ftp/default.asp?ano="
url_string += str(date.year)
url_string += "&mes=" + str(MONTHS_STRING[date.month])
url = urllib.parse.quote(url_string, safe=':/?=&')
url = url.replace('%C3%A7', '%E7') # Replace ç for %E7
print(url)

return url

def _make_href_url(self, href):
"""Preprocess the URL to be aceptable by the souplink.
Args:
href (str): The dodf url part.
Returns:
The desired url preprocessed.
"""
url = "http://www.buriti.df.gov.br/ftp/"
url += href
url = urllib.parse.quote(url, safe=':/?=&')
url = url.replace('%C2', '')
url = url.replace('3%8', '')
url = url.replace('%C3%A7', '%E7')
url = url.replace('%C3%A3', '%E3')

return url

def _make_download_url(self, href):
"""Make downloadable url.
Args:
href (str): The dodf part of url to be downloaded.
Returns:
The url of the dodf to download.
"""
url = "http://www.buriti.df.gov.br/ftp/"
download_url = url + href
download_url = urllib.parse.quote(download_url, safe=':/?=&')

return download_url

def _fail_request_message(self, url, error):
"""Log error messages in download.
Expand All @@ -167,26 +110,6 @@ def _fail_request_message(self, url, error):
"check if the url is online via browser: {}".format(url)
self._log(message)

def _get_soup_link(self, url):
"""Create the souplink to download the pdf.
Args:
url (str): The website url to download the DODF.
Returns:
An :obj:`BeautifulSoup` which html queries are made.
Raises:
RequestException: log error in download.
"""
headers = {'User-Agent': 'Chrome/71.0.3578.80'}
try:
response = requests.get(url, headers=headers)
return BeautifulSoup(response.text, "html.parser")
except requests.exceptions.RequestException as error:
self._fail_request_message(url, error)

def _file_exist(self, path):
"""Check if a file exists.
Expand Down Expand Up @@ -221,8 +144,11 @@ def _download_pdf(self, url, path):
"""
try:
response = requests.get(url)
response.raise_for_status()
except requests.exceptions.RequestException as error:
self._fail_request_message(url, error)
except requests.exceptions.HTTPError as error:
self._fail_request_message(url, error)
else:
pdf_file = Path(path)
pdf_file.write_bytes(response.content)
Expand All @@ -244,8 +170,7 @@ def _make_month_path(self, year, actual_date):
str(actual_date.year))
if year != actual_date.year:
self._create_single_folder(year_path)
month_path = os.path.join(year_path,
MONTHS_STRING[actual_date.month])
month_path = os.path.join(year_path,MONTHS_STRING[actual_date.month])

return month_path

Expand All @@ -272,43 +197,54 @@ def pull(self, start_date, end_date):
+ (end_date.month - start_date.month))
# Creates progress bar
self._prog_bar = tqdm.tqdm(total=months_amt)
# Creates the project folder structure
# # Creates the project folder structure
self._create_download_folder()
year = 0


for month in range(months_amt+1):
# Uses the function relative delta for in cases the date is the
# last of the month, increase month instead of adding more dates
actual_date = start_date + relativedelta(months=+month)
# Convert back to string to update progress bar
desc_bar = str(actual_date)
self._prog_bar.set_description("Date %s" % desc_bar)
# Create and return the path for the DODFs to be downloaded
month_path = self._make_month_path(year, actual_date)
self._create_single_folder(month_path)
url = self._make_url(actual_date)
a_list = self._get_soup_link(url)
year = actual_date.year
for a in a_list.find_all('a', href=True):
a_url = self._make_href_url(a['href'])
download_page = self._get_soup_link(a_url)
self._log("a_URL " + a_url)
number_of_files = int(download_page.find_all('b')[1].text)
year_ = str(year)
month_ = MONTHS_STRING[actual_date.month]


if(check_date(year_,month_) == True):
self._create_single_folder(month_path)
else:
print(f"*** There are still no DODFs for that date: {actual_date.month}/{year_} ***")
continue

_links_for_each_dodf = get_downloads(year_,month_)

for dodf in _links_for_each_dodf:
dodf_name = dodf
links = _links_for_each_dodf[dodf]
dodf_path = month_path
if number_of_files > 1:
dodf_path = os.path.join(month_path, a.text)

if(len(links) > 1):
dodf_path = os.path.join(month_path, dodf_name)
self._create_single_folder(dodf_path)

x = 0
for l in links:
x+=1
download_link = l
if(len(links) == 1):
dodf_name_path = os.path.join(dodf_path, dodf_name)
else:
dodf_name_path = os.path.join(dodf_path, f'{dodf_name} {x}')

for a_href in download_page.find_all('a', href=True):
download_url = self._make_download_url(a_href['href'])
dodf_name_path = os.path.join(dodf_path, a_href.text)
if not self._file_exist(dodf_name_path):
self._log("Downloding "
+ os.path.basename(dodf_name_path))
self._download_pdf(download_url, dodf_name_path)
self._log("Downloding "+ os.path.basename(dodf_name_path))
self._download_pdf(download_link, dodf_name_path)
else:
self._log("Jumping to the next")

self._prog_bar.update(1)
self._prog_bar.update(1)

def _log(self, message):
"""Logs a message following the downloader pattern.
Expand All @@ -318,3 +254,8 @@ def _log(self, message):
"""
self._prog_bar.write("[DOWNLOADER] " + str(message))


if __name__ == '__main__':
downloader = Downloader(save_path='./')
downloader.pull(start_date="05/2021", end_date="06/2021")
79 changes: 79 additions & 0 deletions dodfminer/downloader/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import json
from _pytest.python_api import raises
import requests
import os


# Base endpoint for directory listings; the helpers below append `dir=<path>`.
url_ = 'https://dodf.df.gov.br/listar?'
# Base endpoint for file downloads; get_downloads appends the folder value
# followed by `&arquivo=<file name>`.
downl = 'https://dodf.df.gov.br/index/visualizar-arquivo/?pasta='


def req1(year):
    """Build the listing URL for a given year.

    No request is made here; only the URL string is assembled.

    Args:
        year: Year to list. It is converted with ``str``, so both ``2017``
            and ``'2017'`` are accepted.

    Returns:
        str: The listing endpoint followed by ``dir=<year>``.
    """
    return url_ + 'dir=' + str(year)

def req2(url1, month, timeout=30):
    """Fetch the DODF entries listed for one month.

    Args:
        url1 (str): Year listing URL (what ``req1`` returns).
        month (str): Month folder name. It must contain an underscore,
            e.g. ``'01_Janeiro'``.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        tuple: ``(url2, dodfs)`` where ``url2`` is the month listing URL and
        ``dodfs`` is the list of ``(key, value)`` pairs taken from the
        ``data`` mapping of the JSON response.

    Raises:
        ValueError: If ``month`` contains no underscore (also raised by
            ``json.loads`` when the body is not valid JSON).
        requests.exceptions.RequestException: On connection failure,
            timeout, or an HTTP error status.
    """
    # Validate before touching the network so bad input fails fast.
    if "_" not in month:
        raise ValueError("month parameter format is wrong")

    url2 = url1 + '/' + month
    # Without a timeout requests may block forever on a stalled server.
    response = requests.get(url2, timeout=timeout)
    # Surface HTTP errors explicitly instead of parsing an error page as JSON.
    response.raise_for_status()
    content = json.loads(response.content)
    return url2, list(content['data'].items())


def req3(url2, dodf, timeout=30):
    """Fetch the PDF entries listed for one DODF edition.

    Args:
        url2 (str): Month listing URL (first element of what ``req2`` returns).
        dodf (str): Edition folder name; spaces are replaced by ``%20``
            when the URL is built.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        tuple: ``(url3, pdfs)`` where ``url3`` is the edition listing URL and
        ``pdfs`` is the ``data`` value of the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    # Only spaces are escaped: get_downloads later slices this URL to build
    # the download link, so the exact text of url3 must stay as it is.
    url3 = url2 + "/" + dodf.replace(" ", "%20")
    # Without a timeout requests may block forever on a stalled server.
    response = requests.get(url3, timeout=timeout)
    # Surface HTTP errors explicitly instead of parsing an error page as JSON.
    response.raise_for_status()
    content = json.loads(response.content)
    return url3, content['data']


def check_date(year, month, timeout=30):
    """Tell whether the listing for ``year``/``month`` contains any entry.

    Args:
        year: Year of the listing (interpolated into the URL).
        month (str): Month folder name. It must contain an underscore,
            e.g. ``'01_Janeiro'``.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        bool: ``True`` when the JSON response has a non-empty ``data``
        entry, ``False`` when ``data`` is missing or empty.

    Raises:
        ValueError: If ``month`` contains no underscore (also raised by
            ``json.loads`` when the body is not valid JSON).
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    # Validate before touching the network so bad input fails fast.
    if "_" not in month:
        raise ValueError("month parameter format is wrong")

    url1 = url_ + f'dir={year}/{month}'
    # Without a timeout requests may block forever on a stalled server.
    response = requests.get(url1, timeout=timeout)
    # NOTE(review): the status code is intentionally still not checked; it is
    # unclear how the server answers for months without editions, so confirm
    # before adding raise_for_status() here.
    content = json.loads(response.content)
    return 'data' in content and len(content['data']) > 0

def get_downloads(year, month):
    """Collect the download links of every DODF edition of a month.

    Args:
        year: Year to query; converted with ``str`` so that ints are
            accepted just like in ``req1``.
        month (str): Month folder name. It must contain an underscore,
            e.g. ``'01_Janeiro'``.

    Returns:
        dict: Maps each edition name to the list of download URLs of its
        PDF files.

    Raises:
        ValueError: If ``month`` contains no underscore.
    """
    if "_" not in month:
        raise ValueError("month parameter format is wrong")

    # str.find below needs a string; an int year used to raise TypeError here.
    year = str(year)
    url1 = req1(year)
    url2, dodfs = req2(url1, month)
    links = {}

    # List of DODFs: each item is a (key, value) pair and the value is the
    # edition name used both as folder and as dictionary key.
    for dodf in dodfs:
        dodf_name = dodf[1]
        url3, pdfs = req3(url2, dodf_name)
        # The folder part is the same for every PDF of the edition: the path
        # starting at the year, with '/' turned into '|'.
        start = url3.find(year)
        folder = url3[start:].replace("/", "|") + '|' + '&arquivo='
        links[dodf_name] = [
            # Keep only the file name (text after the last '/').
            downl + folder + pdf[pdf.rfind("/") + 1:].replace(" ", "%20")
            for pdf in pdfs
        ]

    return links



2 changes: 1 addition & 1 deletion dodfminer/extract/polished/acts/aposentadoria.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def _props_names(self):
"Fundamento Legal", "Orgao", "Vigencia", "Matricula SIAPE"]

def _rule_for_inst(self):
start = r"(APOSENTAR|CONCEDER\sAPOSENTADORIA,?\s?)"
start = r"(APOSENTAR|CONCEDER,\sAPOSENTADORIA|CONCEDER\sAPOSENTADORIA,?\s?)"
body = r"([\s\S]*?"
end = r"(?<!lei)\s(?:[0-9|\s]*?[.|-]\s?)+?"
end2 = r"[0-9|\s]*/\s?[0-9|\s]*-?\s?[0-9|\s]*[.|,])"
Expand Down
8 changes: 4 additions & 4 deletions tests/support/dodf_pdfs/cessoes.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
,nome,matricula,cargo_efetivo,classe,padrao,orgao_cedente,orgao_cessionario,onus,fundamento legal,processo_SEI,vigencia,matricula_SIAPE,cargo_orgao_cessionario,simbolo,hierarquia_lotacao
0,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
1,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
0,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
1,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
2,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
3,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
de origem, conforme Decisao da Diretoria Executiva, exarada pela Sessao no 4.",,00112-00037276/2019-21.,,, usando das atribuicoes conferidas pelo Art. 25,,
2,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
3,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
10 changes: 5 additions & 5 deletions tests/support/dodf_pdfs/sem_efeito_aposentadoria.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
,tipo_ato,tipo_documento,numero_documento,data_documento,numero_dodf,data_dodf,pagina_dodf,nome,matricula,matricula_SIAPE,cargo_efetivo,classe,padrao,quadro,orgao,processo_SEI,tipo_edicao
0,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
0,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
1,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
2,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
3,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
dezembro de 2017", 39.,"MARCO
ANTONIO CATTANI FRANCA","129661-2,",, 129.661-2,,,,,271.000.680/2017,normal
1,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
2,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
3,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
30 changes: 30 additions & 0 deletions tests/test_donwloader_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pytest
from dodfminer.downloader.helper import req1, req2, req3, get_downloads, check_date


def test_req1():
    """req1 appends the year to the listing endpoint as the ``dir`` value."""
    expected = 'https://dodf.df.gov.br/listar?dir=2017'
    assert req1(2017) == expected

def test_req2():
    """req2 returns a tuple for a well-formed month (performs a live request)."""
    result = req2('https://dodf.df.gov.br/listar?dir=2017', '01_Janeiro')
    # isinstance is the idiomatic type check (type(x) == T rejects subclasses).
    assert isinstance(result, tuple)

def test_req2_2():
    """req2 rejects a month name that has no underscore."""
    # The call itself must raise; wrapping it in `assert` added nothing.
    with pytest.raises(ValueError):
        req2('https://dodf.df.gov.br/listar?dir=2017', 'Janeiro')

def test_req3():
    """req3 returns a tuple (performs a live request)."""
    # NOTE(review): the URL has no month segment and the edition name is
    # dated 2006 while the directory is 2017 — confirm this is the intended
    # input for the check.
    result = req3('https://dodf.df.gov.br/listar?dir=2017', 'DODF 023 31-01-2006')
    # isinstance is the idiomatic type check (type(x) == T rejects subclasses).
    assert isinstance(result, tuple)

def test_get_downloads():
    """get_downloads returns a dict for a well-formed month (live requests)."""
    result = get_downloads('2017', '01_Janeiro')
    # isinstance is the idiomatic type check (type(x) == T rejects subclasses).
    assert isinstance(result, dict)

def test_get_downloads_2():
    """get_downloads rejects a month name that has no underscore."""
    # The call itself must raise; wrapping it in `assert` added nothing.
    with pytest.raises(ValueError):
        get_downloads('2017', 'Janeiro')

def test_check_date():
    """check_date returns a bool for a well-formed month (live request)."""
    result = check_date('2017', '01_Janeiro')
    # isinstance is the idiomatic type check for the returned value.
    assert isinstance(result, bool)

def test_check_date_2():
    """check_date rejects a month name that has no underscore."""
    # The call itself must raise; wrapping it in `assert` added nothing.
    with pytest.raises(ValueError):
        check_date('2017', 'Janeiro')
2 changes: 1 addition & 1 deletion tests/test_extract_polished_acts.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_retirement_prop_names(act_ret):
assert act_ret._props_names() == ["Tipo do Ato", "SEI", "Nome", "Matrícula", "Tipo de Aposentadoria", "Cargo", "Classe", "Padrao", "Quadro", "Fundamento Legal", "Orgao", "Vigencia", "Matricula SIAPE"]

def test_retirement_rule(act_ret):
assert act_ret._rule_for_inst() == r"(APOSENTAR|CONCEDER\sAPOSENTADORIA,?\s?)([\s\S]*?(?<!lei)\s(?:[0-9|\s]*?[.|-]\s?)+?[0-9|\s]*/\s?[0-9|\s]*-?\s?[0-9|\s]*[.|,])"
assert act_ret._rule_for_inst() == r"(APOSENTAR|CONCEDER,\sAPOSENTADORIA|CONCEDER\sAPOSENTADORIA,?\s?)([\s\S]*?(?<!lei)\s(?:[0-9|\s]*?[.|-]\s?)+?[0-9|\s]*/\s?[0-9|\s]*-?\s?[0-9|\s]*[.|,])"

def test_retirement_prop_rules_names(act_ret):
assert list(act_ret._prop_rules()) == ["processo_SEI", "nome", "matricula", "tipo_ret", "cargo_efetivo", "classe", "padrao", "quadro", "fundamento_legal", "orgao", "vigencia", "matricula_SIAPE"]
Expand Down

0 comments on commit 43163be

Please sign in to comment.