In [None]:
# Mount Google Drive inside the Colab VM; the dataset archives used by the
# cells below are read from and written to paths under /content/gdrive.
from google.colab import drive
drive.mount(mountpoint='/content/gdrive')

# Verificar IP

In [None]:
import requests

# Ask api.myip.com which public IP / country this runtime is seen from.
# The timeout keeps the cell from hanging forever if the service is
# unreachable, and raise_for_status() surfaces HTTP errors before .json()
# tries to parse an error page.
my_country = requests.get("https://api.myip.com/", timeout=10)
my_country.raise_for_status()
my_country.json()

{'ip': '35.199.61.18', 'country': 'United States', 'cc': 'US'}

# Extrair Input Do GoogleDrive

In [None]:
from zipfile import ZipFile
import os

def extrair(path, path_output):
  """Unpack every member of the zip archive at ``path`` into ``path_output``.

  Args:
      path: Location of the ``.zip`` file to read.
      path_output: Directory that receives the extracted members.
  """
  with ZipFile(path, mode='r') as archive:
      archive.extractall(path_output)


## Empenhos

In [None]:
# exist_ok=True makes the cell safe to re-run: os.mkdir raised
# FileExistsError on the second execution.
os.makedirs("empenhos", exist_ok=True)
extrair(path="/content/gdrive/MyDrive/datasets/empenhos_input.zip",
        path_output="/content/empenhos")

## Liquidações

In [None]:
# exist_ok=True makes the cell safe to re-run: os.mkdir raised
# FileExistsError on the second execution.
os.makedirs("liquidações", exist_ok=True)

In [None]:
# Unpack the previously saved "liquidações" input archive from Drive.
extrair(
    "/content/gdrive/MyDrive/datasets/liquidações_input.zip",
    "/content/liquidações",
)

# Raspador

## Instalar Selenium

In [None]:
!pip install selenium
!pip install webdriver-manager
!apt-get update
!apt install chromium-chromedriver

## Extrair

In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from time import sleep
import os
import shutil
import datetime as dt
import time


class TceExtraction:
    """Scrape expense files from the TCE/SC transparency panel, city by city.

    Instantiating the class runs the whole job: ``__init__`` opens the browser
    and immediately calls :meth:`extraction`, which downloads the files of
    every requested city and finally quits the browser.

    Files are downloaded into ``<cwd>/<category>/input`` and then moved into
    one sub-folder per city inside that same directory.
    """

    def __init__(self, number_category: int = 0,
                 city_names: list = None):
        """Open the browser and run the extraction.

        Args:
            number_category: Key of the category to scrape
                (0: 'empenhos', 1: 'liquidações', 2: 'pagamentos'). It names
                the download folder and indexes the category filter on the
                page.
            city_names: Names of the cities to download. When ``None`` the
                names are collected from the page (see :meth:`get_citys`).

        Raises:
            KeyError: If ``number_category`` is not 0, 1 or 2.
        """

        category_name = {
            0: 'empenhos',
            1: 'liquidações',
            2: 'pagamentos'
        }

        self.path_input = os.path.join(
            os.getcwd(),
            category_name[number_category]
            , 'input')

        # False until the first date range of the current city was selected;
        # rotine_download uses it to choose between "rewind" and "advance".
        self.city_rotine = False
        self.city_names = city_names
        self.number_category = number_category
        self.drive = self.create_webdrive()
        self.action = self.create_action()

        self.wait = self.create_wait(60)
        self.wait_to_verific = self.create_wait(5)

        self.extraction()

    def create_webdrive(self):
        """Start a headless Firefox session and open the expenses panel.

        Returns:
            The started WebDriver, already navigated to the panel URL.
        """

        options = Options()
        options.add_argument('-headless')
        # Download preferences: point downloads at self.path_input and list
        # application/x-gzip under neverAsk.saveToDisk.
        options.set_preference('browser.download.folderList', 2)
        options.set_preference('browser.download.manager.showWhenStarting', False)
        options.set_preference('browser.download.dir', self.path_input)
        options.set_preference('browser.helperApps.neverAsk.saveToDisk', "application/x-gzip")
        options.add_argument('-no-sandbox')
        options.add_argument('-disable-dev-shm-usage')
        options.add_argument("-disable-blink-features=AutomationControlled")
        options.add_argument("-disable-extensions")
        options.add_argument("-incognito")
        # `Options` is selenium's *Firefox* options class (set_preference is
        # Firefox-only), so it must be paired with the Firefox driver rather
        # than webdriver.Chrome.
        drive = webdriver.Firefox(options=options)

        drive.get(
            "https://paineistransparencia.tce.sc.gov.br/extensions/appDespesasMunicipaisExternoNovo/index.html")

        return drive

    def create_action(self):
        """Return an ``ActionChains`` bound to the current driver."""
        return ActionChains(self.drive)

    def create_wait(self, deplay: int):
        """Return a ``WebDriverWait`` on the current driver.

        Args:
            deplay: Timeout, in seconds, given to ``WebDriverWait``.
        """
        return WebDriverWait(self.drive, deplay)

    def wait_page_ready(self):
        """Wait for the element with id ``incodataNavA05`` and click it."""
        self.wait.until(EC.presence_of_element_located((By.ID, 'incodataNavA05')))
        self.click_by_js(By.ID, 'incodataNavA05')

    def select_city(self, municipio_name: str) -> None:
        """Clear the current selections and select ``municipio_name``.

        The name is typed into the list-box search input and the list item
        whose ``title`` equals the name is clicked.

        Args:
            municipio_name: City name exactly as shown in the list box.
        """

        self.click_by_js(By.ID, 'clearselections')
        selector = 'div[class="qv-listbox-search"] > div > input'
        self.click_by_js(By.CLASS_NAME, 'title-wrapper')
        self.click_by_js(By.CSS_SELECTOR, 'button[tid="selection-toolbar.clear"]')

        self.drive.find_element(By.CSS_SELECTOR, selector).send_keys(municipio_name)

        # The search result is highlighted once the list box has filtered.
        self.wait_to_verific.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.highlighted')))

        self.click_by_mouse(
            By.XPATH, f'//span[@title="{municipio_name}"]//ancestor::li')

        self.click_by_js(By.CLASS_NAME, 'title-wrapper')

    def select_dates(self, by, match):
        """Click the first and then the last element matching the locator.

        Args:
            by: Selenium locator strategy.
            match: Locator value.
        """

        # Indexes 0 and -1: first and last matching element.
        for n in range(0, -2, -1):
            self.wait.until(EC.visibility_of_element_located((by, match)))
            self.drive.find_elements(by, match)[n].click()

    def download_button(self) -> None:
        """Click the download button and block until a new file is complete."""

        self.wait.until(EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "th#BEq-header-1")))

        old_quantity = len(os.listdir(self.path_input))
        self.click_by_js(By.ID, "GRA_1_Dow")

        self.wait.until(lambda x: self.wait_download_ends(old_quantity))

        self.drive.switch_to.window(self.drive.window_handles[0])

    def wait_download_ends(self, old_quantity: int) -> bool:
        """Tell whether a download into ``self.path_input`` has finished.

        Args:
            old_quantity: Number of entries the folder had before the click.

        Returns:
            ``True`` when the number of entries changed and no entry name
            contains ``.part``; ``False`` otherwise.
        """

        files = os.listdir(self.path_input)
        # Check each name on its own: joining the names first could produce a
        # spurious ".part" across the boundary of two adjacent file names.
        download_pending = any(".part" in name for name in files)
        return len(files) != old_quantity and not download_pending

    def rotine_download(self, attempt: int):
        """Move the date picker to the next range with data and select it.

        On the first call for a city (``self.city_rotine`` is ``False``) the
        picker is rewound by clicking "prev" until that control is no longer
        available; on later calls it is advanced with "next".

        Args:
            attempt: 0 or 1; picks the selector pair in ``selector_data_list``
                and how many "next" clicks are issued per step
                (``2 - attempt``).

        Returns:
            ``False`` after a date range was selected, ``True`` when "next"
            is no longer available (nothing left to download for the city).
        """

        sleep(1)
        self.click_filter_category()

        selector_data_list = [{
            "no_data": 'td.nodata',
            "data": 'td.available'
        }, {
            "no_data": 'div.dpleft td.nodata',
            "data": 'div.dpleft td.available'
        }]

        while True:

            if self.city_rotine:
                for _ in range(2 - attempt):
                    try:
                        self.click_by_js(
                            By.CSS_SELECTOR, 'th[class="next available"]', wait=self.wait_to_verific)

                    except TimeoutException:

                        # No further page: close the filter and report done.
                        self.click_filter_category()
                        self.city_rotine = False
                        return True
            else:

                while True:

                    try:
                        self.click_by_js(
                            By.CSS_SELECTOR, 'th[class="prev available"]', wait=self.wait_to_verific)

                    # `Exception` instead of a bare except: still stops the
                    # rewind on any Selenium error, but no longer swallows
                    # KeyboardInterrupt / SystemExit.
                    except Exception:

                        if attempt > 0:
                            self.click_by_js(
                                By.CSS_SELECTOR, 'th[class="next available"]', wait=self.wait_to_verific)
                        break

            no_data = self.drive.find_elements(
                By.CSS_SELECTOR, selector_data_list[attempt]['no_data'])

            data = self.drive.find_elements(
                By.CSS_SELECTOR, selector_data_list[attempt]['data'])

            self.city_rotine = True

            # Equal counts are treated as "nothing to select here": loop
            # again, which now advances with "next".
            if len(data) != len(no_data):
                self.select_dates(By.CSS_SELECTOR,
                                  selector_data_list[attempt]['data'])
                return False

    def click_filter_category(self) -> None:
        """Click the ``bootstrap_inside`` element at index ``number_category``."""

        selector_categorys = 'bootstrap_inside'

        category_elementes = self.wait_to_verific.until(
            EC.visibility_of_all_elements_located(
                (By.CLASS_NAME,
                 selector_categorys)))

        self.drive.execute_script("arguments[0].click();",
                                  category_elementes[self.number_category])

    def get_citys(self) -> list:
        """Collect the names shown in the "Ente" list box.

        The list box is scrolled with the DOWN key and the visible entries
        are accumulated until the last visible entry stops changing.

        Returns:
            The collected names, de-duplicated, in order of first appearance.
        """

        selector_list_city = 'li > div[class ="qv-listbox-text qv-listbox-text-value"]'

        self.click_by_js(By.CSS_SELECTOR, 'h6[aria-label="Ente"]')

        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, selector_list_city)))
        self.action.send_keys(Keys.TAB)
        self.action.perform()

        citys_names = []

        while True:

            citys = self.drive.find_elements(
                By.CSS_SELECTOR, selector_list_city)

            presenty_list = [city.text for city in citys if city.text]

            # Same last entry as in the previous pass: the end was reached.
            if citys_names and citys_names[-1] == presenty_list[-1]:
                break

            citys_names += presenty_list

            n_downs_listbox = 8 if len(citys_names) > 8 else 15
            for _ in range(n_downs_listbox):
                self.action.send_keys(Keys.DOWN)
                self.action.perform()

        # dict.fromkeys drops duplicates while keeping the original order.
        citys_names = list(dict.fromkeys(citys_names))

        self.click_by_js(By.CLASS_NAME, 'title-wrapper')

        return citys_names

    def check_for_real_date_by_city(self) -> bool:
        """Return ``True`` when more than one current-selection item exists."""

        sleep(1)

        selections_item = self.wait_to_verific.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.current-selections-item')
            ))

        return len(selections_item) > 1

    def try_download_city_files(self, city_name: str,
                                attempt: int = 0) -> None:
        """Download every available file of one city into its own folder.

        Cities whose folder already contains files are skipped.

        Args:
            city_name: City name as shown on the page.
            attempt: Forwarded to :meth:`rotine_download`.
        """

        path_city_folder = os.path.join(self.path_input, self.limpar_name(city_name))

        if self.verify_downloaded_citys(path_city_folder):
            return None

        self.drive.execute_script('document.getElementById("Filtros").scrollTop = 0')

        self.select_city(city_name)

        # NOTE: scrollTopMax is a non-standard, Gecko-only DOM property, so
        # this scroll-to-bottom relies on the browser being Firefox.
        scrolltopmax = self.drive.execute_script('return document.getElementById("Filtros").scrollTopMax')
        self.drive.execute_script(f'document.getElementById("Filtros").scrollTop = {scrolltopmax}')

        finish = False

        while not finish:

            finish = self.rotine_download(attempt)

            if not self.check_for_real_date_by_city():
                break

            self.download_button()

        self.move_files_to_city_folder(path_city_folder)

    def download_control(self) -> None:
        """Download the files of every city, with up to two attempts each."""

        # A fixed pause is used here instead of wait_page_ready().
        sleep(120)

        if self.city_names is None:
            print("Coletando os municipios.")
            self.city_names = self.get_citys()
        print(self.city_names)

        for n, city_name in enumerate(self.city_names):

            start_time = time.time()
            print(f"{n + 1}/{len(self.city_names)}\nTentando baixar arquivos de: {city_name}")
            for attempt in range(2):
                try:

                    print(f"Tentativa: {attempt + 1}/2")
                    self.try_download_city_files(city_name, attempt)
                    print(f"Sucesso! {city_name}\nConcluido em: {(time.time() - start_time) / 60:.2f} minutos")
                    break

                except Exception as erro:

                    # Report first: if the page recovery below raises too,
                    # the original error would otherwise never be shown.
                    print(f"Falha ao tentar baixar!\nErro: {erro}")
                    self.drive.refresh()
                    self.wait_page_ready()

                finally:
                    # Reset per-city state and drop leftover files so the
                    # next attempt / city starts from a clean input folder.
                    self.city_rotine = False
                    self.delete_files()

    def verify_downloaded_citys(self, city_folder: str) -> bool:
        """Tell whether ``city_folder`` already holds downloaded files.

        A missing folder is created.

        Returns:
            ``True`` if the folder exists and is not empty, else ``False``.
        """

        try:
            return bool(os.listdir(city_folder))

        except FileNotFoundError:
            self.create_folders([city_folder])
            return False

    def move_files_to_city_folder(self, city_folder: str) -> None:
        """Move entries of ``self.path_input`` whose name contains ``.csv`` or ``.xlsx`` into ``city_folder``."""

        files = [file for file in os.listdir(self.path_input)
                 if file.count('.csv') or file.count('.xlsx')]
        for f in files:
            path_file_to_move = os.path.join(self.path_input, f)
            shutil.move(path_file_to_move, city_folder)

    def delete_files(self):
        """Remove the plain files directly inside ``self.path_input``.

        Directories (the per-city folders) and files that cannot be removed
        are deliberately skipped.
        """

        files = os.listdir(self.path_input)
        for f in files:
            try:
                os.remove(os.path.join(self.path_input, f))
            except (IsADirectoryError, PermissionError):
                pass

    def extraction(self) -> None:
        """Create the working folders, run the downloads and always clean up."""

        try:
            self.create_folders([self.path_input, "output"])
            self.download_control()

        finally:
            sleep(5)
            self.delete_files()
            self.drive.quit()

    def click_by_js(self, by, match, wait=None) -> None:
        """Wait until the element is visible and click it through JavaScript.

        Args:
            by: Selenium locator strategy.
            match: Locator value.
            wait: ``WebDriverWait`` to use; defaults to ``self.wait``.

        Raises:
            TimeoutException: If the element does not become visible in time.
        """

        if wait is None:
            wait = self.wait

        where_click = wait.until(
            EC.visibility_of_element_located((by, match)))
        self.drive.execute_script("arguments[0].click();", where_click)

    def click_by_mouse(self, by, match) -> None:
        """Wait until the element is visible and click it with the mouse.

        The click is issued through ``ActionChains`` at an offset of (5, 5)
        passed to ``move_to_element_with_offset``.
        """

        where_click = self.wait.until(
            EC.visibility_of_element_located((by, match)))

        self.action.move_to_element_with_offset(where_click, 5, 5)
        self.action.click()
        self.action.perform()

    @staticmethod
    def create_folders(name_folders: list) -> bool:
        """Create every folder in ``name_folders`` (including parents).

        Every entry is processed; the loop no longer returns after the first
        folder, which left all later folders uncreated.

        Args:
            name_folders: Paths of the folders to create.

        Returns:
            ``True`` if at least one of the folders already existed,
            ``False`` if all of them were newly created.
        """
        already_existed = False
        for folder in name_folders:
            try:
                os.makedirs(folder)
            except FileExistsError:
                already_existed = True
        return already_existed

    @staticmethod
    def limpar_name(name: str,
                    place: str = "-",
                    remove_list: list = None) -> str:
        """Replace path-separator sequences in ``name`` so it can name a folder.

        Args:
            name: Text to clean.
            place: Replacement text.
            remove_list: Substrings to replace, applied in order. Defaults to
                a double slash, a double backslash and a single slash.

        Returns:
            ``name`` with every listed substring replaced by ``place``.
        """

        if remove_list is None:
            remove_list = [r"//", r"\\", "/"]

        for remove in remove_list:
            name = name.replace(remove, place)

        return name

In [None]:
# Entity names (municipalities and inter-municipal consortia), one per line.
city_names = [
    'JOINVILLE',
    'CONSÓRCIO INTERMUNICIPAL DE SAÚDE DO NORDESTE DE SANTA CATARINA - CIS-NORDESTE/SC',
    'CONSÓRCIO INTERMUNICIPAL MULTIFINALITÁRIO DA REGIÃO DA AMUNESC',
    'PERITIBA',
    'GUABIRUBA',
    'VITOR MEIRELES',
    'TANGARÁ',
    'GARUVA',
    'PAPANDUVA',
    'CORREIA PINTO',
    'LAGUNA',
    'QUILOMBO',
    'JOAÇABA',
    'IRINEÓPOLIS',
    'AGRONÔMICA',
    'GRÃO-PARÁ',
    'VIDEIRA',
    'ITAPIRANGA',
    'PALMITOS',
    'RIO DO OESTE',
    'RIO DAS ANTAS',
    'UNIÃO DO OESTE',
]

In [None]:
if __name__ == '__main__':

    start_time = time.time()
    print(f"Iniciado: {dt.datetime.now()}")
    # Pass the `city_names` list defined in the previous cell: it was built
    # but never handed to the scraper, so the run ignored it and collected
    # every entity from the page instead.
    # number_category -> 0: empenho, 1: liquidacao, 2: pagamento
    TceExtraction(number_category=0, city_names=city_names)
    print(f"Finalizando: {dt.datetime.now()}\nDuração da execução: {(time.time() - start_time) / 60:.2f}")

# Colocar Input em ZIP no GoogleDrive


## Empenhos

In [None]:
import shutil
# Zip the whole /content/empenhos tree and store the archive on Drive.
shutil.make_archive(
    base_name="/content/gdrive/MyDrive/datasets/empenhos_ente_22",
    format='zip',
    root_dir="/content/empenhos",
)

'/content/gdrive/MyDrive/datasets/empenhos_ente_22.zip'

## Liquidações

In [None]:
import shutil
# Zip the whole /content/liquidações tree and store the archive on Drive.
shutil.make_archive(
    base_name="/content/gdrive/MyDrive/datasets/liquidações_ente_22",
    format='zip',
    root_dir="/content/liquidações",
)

'/content/gdrive/MyDrive/datasets/liquidações_ente_22.zip'

## Pagamentos

In [None]:
import shutil
# Zip the whole /content/pagamentos tree and store the archive on Drive.
shutil.make_archive(
    base_name="/content/gdrive/MyDrive/datasets/pagamentos_ente_22",
    format='zip',
    root_dir="/content/pagamentos",
)

'/content/gdrive/MyDrive/datasets/pagamentos_ente_22.zip'