
Commit

Merge c83de99 into de50ef8
Khalil09 committed Mar 19, 2021
2 parents de50ef8 + c83de99 commit 36557c5
Showing 2 changed files with 106 additions and 17 deletions.
49 changes: 32 additions & 17 deletions dodfminer/downloader/core.py
@@ -109,16 +109,16 @@ def _make_url(self, date):
The complete url to the buriti website to download the DODF.
"""
url_string = "http://www.buriti.df.gov.br/ftp/default.asp?ano="
url_string += str(date.year)
url_string += "&mes=" + str(MONTHS_STRING[date.month])
url_string = f"http://www.buriti.df.gov.br/ftp/novo_portal_gdf/novo_dodf.asp?Ano={str(date.year)}&Mes={str(MONTHS_STRING[date.month])}&dir="
# url_string = "http://www.buriti.df.gov.br/ftp/default.asp?ano="
# url_string += str(date.year)
# url_string += "&mes=" + str(MONTHS_STRING[date.month])
url = urllib.parse.quote(url_string, safe=':/?=&')
url = url.replace('%C3%A7', '%E7') # Replace ç for %E7
print(url)

return url

def _make_href_url(self, href):
def _make_href_url(self, url, href):
"""Preprocess the URL to be aceptable by the souplink.
Args:
@@ -128,13 +128,13 @@ def _make_href_url(self, href):
The desired url preprocessed.
"""
url = "http://www.buriti.df.gov.br/ftp/"
url += href
url = urllib.parse.quote(url, safe=':/?=&')
url = url.replace('%C2', '')
url = url.replace('3%8', '')
url = url.replace('%C3%A7', '%E7')
url = url.replace('%C3%A3', '%E3')
print(url)

return url

@@ -224,7 +224,7 @@ def _download_pdf(self, url, path):
except requests.exceptions.RequestException as error:
self._fail_request_message(url, error)
else:
pdf_file = Path(path)
pdf_file = Path(path + ".pdf")
pdf_file.write_bytes(response.content)
self._log("Finished " + os.path.basename(path))

@@ -286,28 +286,43 @@ def pull(self, start_date, end_date):
month_path = self._make_month_path(year, actual_date)
self._create_single_folder(month_path)
url = self._make_url(actual_date)
a_list = self._get_soup_link(url)
soup_obj = self._get_soup_link(url)
select = soup_obj.find('select', attrs={
'class': 'chzn-select', 'data-placeholder': 'Selecione o Diário...'})
dodfs_list = select.find_all('option')

year = actual_date.year
for a in a_list.find_all('a', href=True):
a_url = self._make_href_url(a['href'])
download_page = self._get_soup_link(a_url)
self._log("a_URL " + a_url)
number_of_files = int(download_page.find_all('b')[1].text)
for dodf in dodfs_list:
dodf_url = self._make_href_url(url, dodf.text)
download_page = self._get_soup_link(dodf_url)
self._log("a_URL " + dodf_url)
find_links = download_page.find_all('a')
number_of_files = len(find_links)
dodf_path = month_path
if number_of_files > 1:
dodf_path = os.path.join(month_path, a.text)
dodf_path = os.path.join(month_path, dodf.text)
self._create_single_folder(dodf_path)

for a_href in download_page.find_all('a', href=True):
download_url = self._make_download_url(a_href['href'])
dodf_name_path = os.path.join(dodf_path, a_href.text)
for link in find_links:
download_url = self._make_download_url(link.get('href')[3:])
dodf_name_path = os.path.join(dodf_path, link.text)
if not self._file_exist(dodf_name_path):
self._log("Downloding "
+ os.path.basename(dodf_name_path))
self._download_pdf(download_url, dodf_name_path)
else:
self._log("Jumping to the next")

# for a_href in download_page.find_all('a', href=True):
# download_url = self._make_download_url(a_href['href'])
# dodf_name_path = os.path.join(dodf_path, a_href.text)
# if not self._file_exist(dodf_name_path):
# self._log("Downloding "
# + os.path.basename(dodf_name_path))
# self._download_pdf(download_url, dodf_name_path)
# else:
# self._log("Jumping to the next")

self._prog_bar.update(1)

def _log(self, message):
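For context, a minimal standalone sketch of the lookup the new pull() performs. This is only an illustration, not part of the commit: the URL pattern, selector class, and data-placeholder are taken from the diff above, while the year and month values (2021, 03_Mar%E7o) are example inputs.

import requests
from bs4 import BeautifulSoup

# Build the new-style URL produced by _make_url() for an example month.
url = ("http://www.buriti.df.gov.br/ftp/novo_portal_gdf/novo_dodf.asp"
       "?Ano=2021&Mes=03_Mar%E7o&dir=")
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# The same drop-down lookup pull() now uses to list the available DODFs.
select = soup.find('select', attrs={
    'class': 'chzn-select', 'data-placeholder': 'Selecione o Diário...'})
for option in select.find_all('option'):
    print(option.text)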
74 changes: 74 additions & 0 deletions vitor.py
@@ -0,0 +1,74 @@
from bs4 import BeautifulSoup
import os  # path handling that works across operating systems
import requests


def download(url, endereco):
    resposta = requests.get(url)  # returns the raw bytes that make up the file

    try:
        with open(endereco, 'wb') as novo_arquivo:  # wb: binary write mode
            novo_arquivo.write(resposta.content)
        print('Download salvo em {}'.format(endereco))
    except OSError:
        print('Ocorreu um Erro')


MONTHS_STRING = ["", "01_Janeiro", "02_Fevereiro", "03_Mar%E7o", "04_Abril",
"05_Maio", "06_Junho", "07_Julho", "08_Agosto",
"09_Setembro", "10_Outubro", "11_Novembro", "12_Dezembro"]

ano = int(input('Escolha um ano:'))
mes = int(input('Escolha um mes:'))
mes_format = ""

for y in range(0, 13):
    if mes == y:
        mes_format = MONTHS_STRING[y]


# Base URL
url_ = "http://www.buriti.df.gov.br/ftp/novo_portal_gdf/novo_dodf.asp?Ano={}&Mes={}&dir=".format(
ano, mes_format)


r = requests.get(url_)
soup = BeautifulSoup(r.content, 'html.parser')

# Find options
select = soup.find('select', attrs={
    'class': 'chzn-select', 'data-placeholder': 'Selecione o Diário...'})
options = select.find_all('option')


# DODF choice
for c in range(1, len(options)):
    print('Opção {} : {}'.format(c, options[c].text))
opcao = int(input('Escolha uma opção:'))
txt = options[opcao].text
txt_cadeia = txt.split()

# Complete url formation
for d in range(0, len(txt_cadeia)):
    if d == len(txt_cadeia) - 1:
        url_ = url_ + txt_cadeia[d]
    else:
        url_ = url_ + txt_cadeia[d] + '+'
url_link = url_

rr = requests.get(url_link)
soup2 = BeautifulSoup(rr.content, 'html.parser')

find_links = soup2.find_all('a')
links = []

base = "http://www.buriti.df.gov.br/ftp"
for link in find_links:
    links.append(base + link.get('href')[2:].replace(' ', '%20'))
    print(link.get('href'))

os.makedirs('files', exist_ok=True)  # make sure the output folder exists before saving
count = 0
for url in links:
    output = os.path.join('files/', 'file{}.pdf'.format(count))
    download(url, output)
    count += 1
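A slightly more defensive variant of the download() helper above, as a sketch only: it keeps the same requests calls but adds raise_for_status(), so HTTP error pages are not written to disk as if they were PDFs, and it catches only the expected exception types.

def download(url, endereco):
    try:
        resposta = requests.get(url)
        resposta.raise_for_status()  # abort on HTTP errors (404, 500, ...) instead of saving an error page
        with open(endereco, 'wb') as novo_arquivo:  # wb: binary write mode
            novo_arquivo.write(resposta.content)
        print('Download salvo em {}'.format(endereco))
    except (requests.RequestException, OSError):
        print('Ocorreu um Erro')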
