
Commit

Merge c83de99 into de50ef8
Khalil09 committed Mar 19, 2021
2 parents de50ef8 + c83de99 commit 36557c5
Showing 2 changed files with 106 additions and 17 deletions.
49 changes: 32 additions & 17 deletions dodfminer/downloader/core.py
@@ -109,16 +109,16 @@ def _make_url(self, date):
The complete url to the buriti website to download the DODF.
"""
url_string = "http://www.buriti.df.gov.br/ftp/default.asp?ano="
url_string += str(date.year)
url_string += "&mes=" + str(MONTHS_STRING[date.month])
url_string = f"http://www.buriti.df.gov.br/ftp/novo_portal_gdf/novo_dodf.asp?Ano={str(date.year)}&Mes={str(MONTHS_STRING[date.month])}&dir="
# url_string = "http://www.buriti.df.gov.br/ftp/default.asp?ano="
# url_string += str(date.year)
# url_string += "&mes=" + str(MONTHS_STRING[date.month])
url = urllib.parse.quote(url_string, safe=':/?=&')
url = url.replace('%C3%A7', '%E7') # Replace ç for %E7
print(url)

return url

def _make_href_url(self, href):
def _make_href_url(self, url, href):
"""Preprocess the URL to be aceptable by the souplink.
Args:
@@ -128,13 +128,13 @@ def _make_href_url(self, href):
The desired url preprocessed.
"""
url = "http://www.buriti.df.gov.br/ftp/"
url += href
url = urllib.parse.quote(url, safe=':/?=&')
url = url.replace('%C2', '')
url = url.replace('3%8', '')
url = url.replace('%C3%A7', '%E7')
url = url.replace('%C3%A3', '%E3')
print(url)

return url

@@ -224,7 +224,7 @@ def _download_pdf(self, url, path):
except requests.exceptions.RequestException as error:
self._fail_request_message(url, error)
else:
pdf_file = Path(path)
pdf_file = Path(path + ".pdf")
pdf_file.write_bytes(response.content)
self._log("Finished " + os.path.basename(path))

@@ -286,28 +286,43 @@ def pull(self, start_date, end_date):
month_path = self._make_month_path(year, actual_date)
self._create_single_folder(month_path)
url = self._make_url(actual_date)
a_list = self._get_soup_link(url)
soup_obj = self._get_soup_link(url)
select = soup_obj.find('select', attrs={
'class': 'chzn-select', 'data-placeholder': 'Selecione o Diário...'})
dodfs_list = select.find_all('option')

year = actual_date.year
for a in a_list.find_all('a', href=True):
a_url = self._make_href_url(a['href'])
download_page = self._get_soup_link(a_url)
self._log("a_URL " + a_url)
number_of_files = int(download_page.find_all('b')[1].text)
for dodf in dodfs_list:
dodf_url = self._make_href_url(url, dodf.text)
download_page = self._get_soup_link(dodf_url)
self._log("a_URL " + dodf_url)
find_links = download_page.find_all('a')
number_of_files = len(find_links)
dodf_path = month_path
if number_of_files > 1:
dodf_path = os.path.join(month_path, a.text)
dodf_path = os.path.join(month_path, dodf.text)
self._create_single_folder(dodf_path)

for a_href in download_page.find_all('a', href=True):
download_url = self._make_download_url(a_href['href'])
dodf_name_path = os.path.join(dodf_path, a_href.text)
for link in find_links:
download_url = self._make_download_url(link.get('href')[3:])
dodf_name_path = os.path.join(dodf_path, link.text)
if not self._file_exist(dodf_name_path):
self._log("Downloding "
+ os.path.basename(dodf_name_path))
self._download_pdf(download_url, dodf_name_path)
else:
self._log("Jumping to the next")

# for a_href in download_page.find_all('a', href=True):
# download_url = self._make_download_url(a_href['href'])
# dodf_name_path = os.path.join(dodf_path, a_href.text)
# if not self._file_exist(dodf_name_path):
# self._log("Downloding "
# + os.path.basename(dodf_name_path))
# self._download_pdf(download_url, dodf_name_path)
# else:
# self._log("Jumping to the next")

self._prog_bar.update(1)

def _log(self, message):
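For context, a minimal standalone sketch of the lookup the new pull() performs. This is only an illustration, not part of the commit: the URL pattern, selector class, and data-placeholder are taken from the diff above, while the year and month values (2021, 03_Mar%E7o) are example inputs.

import requests
from bs4 import BeautifulSoup

# Build the new-style URL produced by _make_url() for an example month.
url = ("http://www.buriti.df.gov.br/ftp/novo_portal_gdf/novo_dodf.asp"
       "?Ano=2021&Mes=03_Mar%E7o&dir=")
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# The same drop-down lookup pull() now uses to list the available DODFs.
select = soup.find('select', attrs={
    'class': 'chzn-select', 'data-placeholder': 'Selecione o Diário...'})
for option in select.find_all('option'):
    print(option.text)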
74 changes: 74 additions & 0 deletions vitor.py
@@ -0,0 +1,74 @@
from bs4 import BeautifulSoup
import os  # path handling that works across operating systems
import requests


def download(url, endereco):
    resposta = requests.get(url)  # returns the raw bytes that make up the file

    try:
        with open(endereco, 'wb') as novo_arquivo:  # wb: binary write mode
            novo_arquivo.write(resposta.content)
        print('Download salvo em {}'.format(endereco))
    except OSError:
        print('Ocorreu um Erro')


MONTHS_STRING = ["", "01_Janeiro", "02_Fevereiro", "03_Mar%E7o", "04_Abril",
"05_Maio", "06_Junho", "07_Julho", "08_Agosto",
"09_Setembro", "10_Outubro", "11_Novembro", "12_Dezembro"]

ano = int(input('Escolha um ano:'))
mes = int(input('Escolha um mes:'))
mes_format = ""

for y in range(0, 13):
    if mes == y:
        mes_format = MONTHS_STRING[y]


# Base URL
url_ = "http://www.buriti.df.gov.br/ftp/novo_portal_gdf/novo_dodf.asp?Ano={}&Mes={}&dir=".format(
ano, mes_format)


r = requests.get(url_)
soup = BeautifulSoup(r.content, 'html.parser')

# Find options
select = soup.find('select', attrs={
    'class': 'chzn-select', 'data-placeholder': 'Selecione o Diário...'})
options = select.find_all('option')


# DODF choice
for c in range(1, len(options)):
    print('Opção {} : {}'.format(c, options[c].text))
opcao = int(input('Escolha uma opção:'))
txt = options[opcao].text
txt_cadeia = txt.split()

# Complete url formation
for d in range(0, len(txt_cadeia)):
    if d == len(txt_cadeia) - 1:
        url_ = url_ + txt_cadeia[d]
    else:
        url_ = url_ + txt_cadeia[d] + '+'
url_link = url_

rr = requests.get(url_link)
soup2 = BeautifulSoup(rr.content, 'html.parser')

find_links = soup2.find_all('a')
links = []

base = "http://www.buriti.df.gov.br/ftp"
for link in find_links:
    links.append(base + link.get('href')[2:].replace(' ', '%20'))
    print(link.get('href'))

os.makedirs('files', exist_ok=True)  # make sure the output folder exists before saving
count = 0
for url in links:
    output = os.path.join('files/', 'file{}.pdf'.format(count))
    download(url, output)
    count += 1
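A slightly more defensive variant of the download() helper above, as a sketch only: it keeps the same requests calls but adds raise_for_status(), so HTTP error pages are not written to disk as if they were PDFs, and it catches only the expected exception types.

def download(url, endereco):
    try:
        resposta = requests.get(url)
        resposta.raise_for_status()  # abort on HTTP errors (404, 500, ...) instead of saving an error page
        with open(endereco, 'wb') as novo_arquivo:  # wb: binary write mode
            novo_arquivo.write(resposta.content)
        print('Download salvo em {}'.format(endereco))
    except (requests.RequestException, OSError):
        print('Ocorreu um Erro')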
