In [90]:
import os, sys
sys.path.insert(0, os.path.abspath('../src'))
%load_ext autoreload
%autoreload 2

import yaml, pandas as pd
from dotmap import DotMap

from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from model.document import Cnpj

def generator(df):
    """Lazily yield every row of ``df``, one at a time.

    Rows come from ``df.iterrows()`` with the index label discarded, so each
    yielded value is the row itself. Nothing is read from ``df`` until the
    first item is requested.
    """
    yield from (record for _label, record in df.iterrows())

# Parse the application settings from the YAML file that lives next to the sources.
with open('../src/app_config.yml', mode='r', encoding='utf-8') as config_file:
    content = yaml.safe_load(config_file)

# Wrap the parsed mapping so settings can be read with attribute access
# (config.url, config.folder_paths.input, ...).
# NOTE(review): _dynamic=False presumably stops DotMap from auto-creating
# missing keys on access -- confirm against the DotMap documentation.
config = DotMap(content, _dynamic=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [91]:
# Resolve the input/output file locations and the target URL from the configuration.
folder_paths = config.folder_paths
input_file, output_file = (
    os.path.join(folder_paths.input, 'input.csv'),
    os.path.join(folder_paths.output, 'result.xlsx'),
)
url = config.url

In [92]:
# Load the list of CNPJs to look up and preview five random rows.
input_df = pd.read_csv(filepath_or_buffer=input_file)
input_df.sample(n=5)

Unnamed: 0,CNPJs
84,17.138.140/0001-23
33,18.715.615/0001-60
73,32.384.344/0001-38
69,03.133.408/0001-20
21,03.389.126/0001-98


In [93]:
# Build a row generator over the input DataFrame so the rows can be consumed
# one at a time by re-running the notebook cells below.
queue = generator(input_df)

In [94]:
# Accumulator for every record scraped during this session.
documents = list()

# Resolve a chromedriver binary through webdriver_manager and start Chrome with it.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

# Shared explicit wait constructed with a timeout argument of 10.
wait = WebDriverWait(driver, 10)

## Loop de Iteração do DataFrame

In [95]:
# Pull the next input row from the queue; next() raises StopIteration once
# every row has been consumed. The bare expression displays the row.
item = next(queue)
item

CNPJs    17.516.113/0001-47
Name: 0, dtype: object

In [96]:
# Open the query page and click the "captchaSonoro" control.
# NOTE(review): presumably this switches the page to the audio captcha -- confirm.
driver.get(url)
driver.find_element(By.XPATH, '//*[@id="captchaSonoro"]').click()

# wait.until returns the located element, so no second lookup is needed.
element = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="cnpj"]')))

# The CNPJ is passed as a script argument instead of being interpolated into
# the JavaScript source: a quote, backslash or line break in the value can
# then neither break the script nor inject code. str() mirrors the string
# conversion the previous f-string performed.
script = """
    var element = arguments[0];
    element.value = arguments[1];
    element.dispatchEvent(new Event('input'));
"""
driver.execute_script(script, element, str(item.CNPJs))

In [97]:
# Type the captcha answer into the captcha text box. The literal below is the
# answer for one specific captcha image and has to be edited by hand each time.
captcha_field = driver.find_element(By.XPATH, '//*[@id="txtTexto_captcha_serpro_gov_br"]')
captcha_field.send_keys('gkFNHv')

Preencher Captcha

In [98]:
# Press the first button of the query form to submit the lookup.
submit_button = driver.find_element(By.XPATH, '//*[@id="frmConsulta"]/div[3]/div/button[1]')
submit_button.click()

In [99]:
# Detect the server failure page.
# find_elements returns an empty list when nothing matches, whereas
# find_element raises NoSuchElementException. The error control is absent on
# a successful lookup, so this check must not raise in that case.
error_inputs = driver.find_elements(By.XPATH, '/html/body/table/tbody/tr[2]/td/input')
if any(candidate.is_displayed() for candidate in error_inputs):
    print("Server Error - 404")

In [79]:
# Block until the results container is visible before reading any field.
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="principal"]')))
main_table_xpath = '//*[@id="principal"]/table[1]/tbody/tr/td/'

# Cnpj keyword argument -> XPath of its value, relative to main_table_xpath.
# Insertion order is the order in which the page is queried.
field_xpaths = {
    'data_abertura':            'table[2]/tbody/tr/td[3]/font[2]/b',
    'nome_empresarial':         'table[3]/tbody/tr/td/font[2]/b',
    'nome_estabelecimento':     'table[4]/tbody/tr/td[1]/font[2]/b',
    'codigo_desc_principal':    'table[5]/tbody/tr/td/font[2]/b',
    'codigo_desc_secundaria':   'table[6]/tbody/tr/td/font[2]/b',
    'codigo_desc_natureza_jur': 'table[7]/tbody/tr/td/font[2]/b',
    'logradouro':               'table[8]/tbody/tr/td[1]/font[2]/b',
    'numero':                   'table[8]/tbody/tr/td[3]/font[2]/b',
    'complemento':              'table[8]/tbody/tr/td[5]/font[2]/b',
    'cep':                      'table[9]/tbody/tr/td[1]/font[2]/b',
    'ente_federativo':          'table[11]/tbody/tr/td/font[2]/b',
    'situacao_cadastral':       'table[12]/tbody/tr/td[1]/font[2]/b',
    'data_situacao_cadastral':  'table[12]/tbody/tr/td[3]/font[2]/b',
}

# Read the text of every mapped element and store the record.
documents.append(Cnpj(**{
    field: driver.find_element(By.XPATH, f'{main_table_xpath}{suffix}').text
    for field, suffix in field_xpaths.items()
}))

## Fim do Loop

In [None]:
# Flatten the scraped documents into a table for saving.
# Output column header -> attribute read from each document, in column order.
column_to_attribute = {
    "Data Abertura": "data_abertura",
    "Nome Empresarial": "nome_empresarial",
    "Nome Estabelecimento": "nome_estabelecimento",
    "Código Desc. Principal": "codigo_desc_principal",
    "Código Desc. Secundária": "codigo_desc_secundaria",
    "Código Desc. Natureza Jurídica": "codigo_desc_natureza_jur",
    "Logradouro": "logradouro",
    "Número": "numero",
    "Complemento": "complemento",
    "CEP": "cep",
    "Ente Federativo": "ente_federativo",
    "Situação Cadastral": "situacao_cadastral",
    "Data Situação Cadastral": "data_situacao_cadastral",
}

# One dict per document, built once and handed to pandas in a single call.
data_dict_list = [
    {column: getattr(document, attribute) for column, attribute in column_to_attribute.items()}
    for document in documents
]

df = pd.DataFrame(data_dict_list)
df

In [None]:
# Write the collected results to the configured Excel file, without the index column.
# The output directory is created first when it is missing, since to_excel
# does not create parent directories. The emptiness check covers a bare file
# name, for which os.makedirs('') would raise.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_excel(output_file, index=False)