In [207]:
import requests
import zipfile
import os
import pandas as pd

In [219]:
def download_base_datasets(dataset_links: list[str], zip_file_names: list[str], data_path: str) -> None:
    """Download zip archives and keep only their ``*BRASIL.csv`` members.

    For each URL in ``dataset_links`` the archive is streamed to
    ``<data_path>/<zip_file_names[i]>``, every member whose name ends with
    ``"BRASIL.csv"`` is extracted into ``data_path`` and the archive is
    deleted afterwards, including when the download or extraction fails.

    Args:
        dataset_links: URLs of the zip archives to download.
        zip_file_names: File name used for each downloaded archive;
            ``zip_file_names[i]`` belongs to ``dataset_links[i]``.
        data_path: Directory that receives the archives and the extracted
            CSV files. It is created when it does not exist.

    Raises:
        IndexError: If fewer file names than links are given.
        requests.HTTPError: If a download answers with an HTTP error status.
        zipfile.BadZipFile: If a downloaded file is not a valid zip archive.
    """
    if len(zip_file_names) < len(dataset_links):
        # Fail before any download starts rather than midway through the loop.
        raise IndexError("zip_file_names must provide one name per dataset link")

    # open() below does not create missing directories.
    os.makedirs(data_path, exist_ok=True)

    for dataset_link, zip_file_name in zip(dataset_links, zip_file_names):
        zip_file_path = f"{data_path}/{zip_file_name}"

        try:
            # Stream to disk so the archive is never held in memory as a whole;
            # the timeout is (connect, read) seconds so a stalled server cannot
            # block the notebook forever.
            with requests.get(dataset_link, stream=True, timeout=(10, 300)) as response:
                # Without this check an HTML error page would be saved as the
                # "zip" and only fail later with an unrelated BadZipFile.
                response.raise_for_status()
                with open(zip_file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

            # Extract only the members that end with 'BRASIL.csv'
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                for file in zip_ref.namelist():
                    if file.endswith("BRASIL.csv"):
                        zip_ref.extract(file, path=data_path)
                        print(f"Extracted: {file}")
        finally:
            # Clean up the archive whether or not the steps above succeeded.
            if os.path.exists(zip_file_path):
                os.remove(zip_file_path)
        
        
def find_file_by_substring(dir: str, substring: str) -> str:
    """Return the path of the first entry of ``dir`` whose name contains ``substring``.

    Entries are inspected in the order produced by ``os.listdir``.

    Args:
        dir: Directory whose entries are listed.
        substring: Text that must occur in the entry name.

    Returns:
        ``"<dir>/<entry name>"`` for the first matching entry, or an empty
        string when no entry matches.
    """
    matching_names = (entry for entry in os.listdir(dir) if substring in entry)
    first_match = next(matching_names, None)
    if first_match is None:
        return ""
    return f"{dir}/{first_match}"
    

In [216]:
# Reference URLs; none of these three is read by the cells visible in this notebook.
TSE_DATASET = "https://dadosabertos.tse.jus.br/dataset/resultados-2024"
RELACAO_ZN_SC = "https://resultados.tse.jus.br/oficial/ele2024/arquivo-urna/452/config/rs/rs-p000452-cs.json"
RELACAO_PARTIDO_CANDIDATO = "https://qrcodenobu.tse.jus.br/json-bu/oficial/439/o00452rs88013-qbu.js"

# Zip archives handed to download_base_datasets.
SECTION_DETAILS_DATASET_LINK = "https://cdn.tse.jus.br/estatistica/sead/odsele/detalhe_votacao_secao/detalhe_votacao_secao_2024.zip"
CANDIDATE_DETAILS_DATASET_LINK = "https://cdn.tse.jus.br/estatistica/sead/odsele/votacao_candidato_munzona/votacao_candidato_munzona_2024.zip"
dataset_links = [SECTION_DETAILS_DATASET_LINK, CANDIDATE_DETAILS_DATASET_LINK]

# File names (not full paths) used for the downloaded archives. The order must
# match dataset_links because download_base_datasets pairs them by position.
section_zip_file_path = "SECTION_DETAILS_DATASET.zip"
candidate_zip_file_path = "CANDIDATE_DETAILS_DATASET.zip"
zip_files_path = [section_zip_file_path, candidate_zip_file_path]
# Directory the archives are saved to and the CSV files are extracted into.
DATA_PATH = "DATA"

Extracted: detalhe_votacao_secao_2024_BRASIL.csv
Extracted: votacao_candidato_munzona_2024_BRASIL.csv


In [220]:
all_candidates_file_sub_str = "candidato"
all_sections_file_sub_str = "secao"

# find_file_by_substring lists DATA_PATH, so the directory has to exist before
# the first lookup.
os.makedirs(DATA_PATH, exist_ok=True)

candidates_file_path = find_file_by_substring(DATA_PATH, all_candidates_file_sub_str)
sections_file_path = find_file_by_substring(DATA_PATH, all_sections_file_sub_str)

# Download only when a CSV is missing, so re-running the cell does not fetch
# both archives again. Delete the files in DATA_PATH to force a fresh download.
if not (candidates_file_path and sections_file_path):
    download_base_datasets(dataset_links, zip_files_path, DATA_PATH)
    candidates_file_path = find_file_by_substring(DATA_PATH, all_candidates_file_sub_str)
    sections_file_path = find_file_by_substring(DATA_PATH, all_sections_file_sub_str)

# find_file_by_substring reports "not found" as an empty string; stop here
# instead of letting a later read fail on an empty path.
if not candidates_file_path:
    raise FileNotFoundError(
        f"No file containing '{all_candidates_file_sub_str}' found in {DATA_PATH}"
    )
if not sections_file_path:
    raise FileNotFoundError(
        f"No file containing '{all_sections_file_sub_str}' found in {DATA_PATH}"
    )

print(candidates_file_path)
print(sections_file_path)

Extracted: detalhe_votacao_secao_2024_BRASIL.csv
Extracted: votacao_candidato_munzona_2024_BRASIL.csv
DATA/votacao_candidato_munzona_2024_BRASIL.csv
DATA/detalhe_votacao_secao_2024_BRASIL.csv


In [222]:
# Both files are ';'-separated and read as ISO-8859-1; dtype=str loads every
# column as text so no value is converted to a number on load.
candidates_df = pd.read_csv(
    candidates_file_path,
    encoding='ISO-8859-1',
    sep=';',
    dtype=str,
)
sections_df = pd.read_csv(
    sections_file_path,
    encoding='ISO-8859-1',
    sep=';',
    dtype=str,
)

# Print the column overview of each frame.
candidates_df.info()
sections_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 718820 entries, 0 to 718819
Data columns (total 50 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   DT_GERACAO                 718820 non-null  object
 1   HH_GERACAO                 718820 non-null  object
 2   ANO_ELEICAO                718820 non-null  object
 3   CD_TIPO_ELEICAO            718820 non-null  object
 4   NM_TIPO_ELEICAO            718820 non-null  object
 5   NR_TURNO                   718820 non-null  object
 6   CD_ELEICAO                 718820 non-null  object
 7   DS_ELEICAO                 718820 non-null  object
 8   DT_ELEICAO                 718820 non-null  object
 9   TP_ABRANGENCIA             718820 non-null  object
 10  SG_UF                      718820 non-null  object
 11  SG_UE                      718820 non-null  object
 12  NM_UE                      718820 non-null  object
 13  CD_MUNICIPIO               718820 non-null  

In [226]:
# Distinct combinations of the selected candidate columns.
CANDIDATES_NAME_NUMBER_MAP = candidates_df[["NM_URNA_CANDIDATO", "NR_CANDIDATO", "CD_MUNICIPIO","DS_CARGO"]].drop_duplicates()
# Column subset of the sections data used to build the ballot-box URLs.
base_information_dataframe = sections_df[["CD_ELEICAO","NR_ZONA","NR_SECAO","SG_UF","NM_MUNICIPIO","CD_MUNICIPIO","NM_LOCAL_VOTACAO","DS_LOCAL_VOTACAO_ENDERECO"]]

# A notebook cell only renders its last expression, so the first preview has
# to be displayed explicitly (``display`` is provided by the IPython kernel).
display(base_information_dataframe.head())
CANDIDATES_NAME_NUMBER_MAP.head()

Unnamed: 0,NM_URNA_CANDIDATO,NR_CANDIDATO,CD_MUNICIPIO,DS_CARGO
0,BRUNO TEIXEIRA,40,27952,Prefeito
1,GISELDA CARVALHO,13,39039,Prefeito
2,EDER DE NILDA,10,39330,Prefeito
3,DR. ISMAEL DO EXPEDITO,11,15199,Prefeito
4,VANINHO MENDES,40,57991,Prefeito


In [242]:

# Matches "<digits> <word> [<word> ...]"; not used by the loop below.
full_name_pattern = r'^\d+\s+[\wÀ-ÖØ-öø-ÿ]+(\s+[\wÀ-ÖØ-öø-ÿ]+)*$'
# Matches any text that starts with a digit; handed to the scraping helpers.
digit_only_pattern = r'^\d+.*$'

BASE_URL = "https://resultados.tse.jus.br/oficial/app/index.html#/eleicao;"
LAST_URL = "/dados-de-urna/boletim-de-urna"

# The election code is taken from the first row only.
CD_ELEICAO = base_information_dataframe["CD_ELEICAO"].iloc[0]
SG_UF_LIST = base_information_dataframe["SG_UF"].unique()
for uf in SG_UF_LIST:
    # Filter the DataFrame where 'uf' equals 'SG_UF'
    filtered_uf_df = base_information_dataframe[base_information_dataframe['SG_UF'] == uf]
    CD_MU_LIST = filtered_uf_df["CD_MUNICIPIO"].unique()

    for cd_mu in CD_MU_LIST:
        filtered_mu_df = filtered_uf_df[filtered_uf_df['CD_MUNICIPIO'] == cd_mu]
        ZN_LIST = filtered_mu_df["NR_ZONA"].unique()

        for nr_zona in ZN_LIST:
            filtered_zn_df = filtered_mu_df[filtered_mu_df['NR_ZONA'] == nr_zona]
            NS_LIST = filtered_zn_df["NR_SECAO"].unique()

            # One browser per zone, shared by all of its sections.
            driver = fetch_firefox_driver()
            try:
                for nr_secao in NS_LIST:
                    filters = f"e=e{CD_ELEICAO};uf={uf.lower()};mu={cd_mu};ufbu={uf.lower()};mubu={cd_mu};zn={nr_zona};se={nr_secao}"
                    full_url = f"{BASE_URL}{filters}{LAST_URL}"
                    print(full_url)

                    candidates_roles_elements = fetch_candidates_elements(full_url, driver, digit_only_pattern)
                    if candidates_roles_elements is None:
                        # The helper reports a failed page load this way; skip
                        # the section instead of crashing on the next call.
                        print(f"Skipping section {nr_secao}: page could not be scraped")
                        continue

                    candidates = fetch_ballot_box_candidates_information(candidates_roles_elements, nr_secao, nr_zona,cd_mu)
                    print(candidates)
            finally:
                # Close the browser even when the run is interrupted or a
                # section raises, so no Firefox process is left behind.
                driver.quit()
            # The breaks limit the run to the first zone of the first
            # municipality of the first state.
            break
        break
    break

https://resultados.tse.jus.br/oficial/app/index.html#/eleicao;e=e619;uf=ac;mu=1007;ufbu=ac;mubu=1007;zn=9;se=137/dados-de-urna/boletim-de-urna
[{'NR_CANDIDATO': '10000', 'TOTAL_VOTOS': '3', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CANDIDATO': '10100', 'TOTAL_VOTOS': '3', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CANDIDATO': '10111', 'TOTAL_VOTOS': '12', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CANDIDATO': '10123', 'TOTAL_VOTOS': '4', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CANDIDATO': '10222', 'TOTAL_VOTOS': '3', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CANDIDATO': '10333', 'TOTAL_VOTOS': '4', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CANDIDATO': '10444', 'TOTAL_VOTOS': '7', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CANDIDATO': '10555', 'TOTAL_VOTOS': '2', 'NR_ZONA': '9', 'NR_SECAO': '137', 'CD_MUNICIPIO': '1007'}, {'NR_CA

KeyboardInterrupt: 

In [183]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import time
import re
from itertools import chain

In [241]:
def fetch_firefox_driver(driver_options = None):
    """Create a Firefox WebDriver.

    Args:
        driver_options: Firefox ``Options`` to start the browser with. When
            omitted (or falsy) a default set of options is built with the
            preferences listed below.

    Returns:
        The started ``webdriver.Firefox`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    if not driver_options:
        driver_options = Options()
        # Headless mode is currently disabled; enable it with:
        # driver_options.add_argument("--headless")

        # Preferences are set directly on the options object. The previous
        # version replaced the options with a FirefoxProfile and then used the
        # undefined names `profile` and `options`, which only worked when those
        # happened to exist in the kernel from earlier cells.

        # Set Firefox preferences to allow geolocation automatically
        driver_options.set_preference("geo.prompt.testing", True)
        driver_options.set_preference("geo.prompt.testing.allow", True)
        driver_options.set_preference('dom.webdriver.enabled', False)
        driver_options.set_preference('useAutomationExtension', False)
        driver_options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0")

    driver = webdriver.Firefox(options=driver_options)
    return driver


def has_valid_candidate(driver, candidate_name_pattern):
    """Tell whether any ``<p>`` element of the current page matches the pattern.

    The text of each ``<p>`` element is stripped and upper-cased before it is
    matched (from the start of the string) against ``candidate_name_pattern``.

    Args:
        driver: WebDriver whose current page is inspected.
        candidate_name_pattern: Regular expression applied with ``re.match``.

    Returns:
        ``True`` as soon as one element matches, ``False`` when none does.
    """
    normalized_texts = (
        paragraph.text.strip().upper()
        for paragraph in driver.find_elements(By.TAG_NAME, "p")
    )
    return any(
        re.match(candidate_name_pattern, text, re.UNICODE) is not None
        for text in normalized_texts
    )


def wait_for_valid_candidate(driver, candidate_name_pattern, timeout=5):
    """Block until the page shows at least one ``<p>`` matching the pattern.

    Args:
        driver: WebDriver whose current page is polled.
        candidate_name_pattern: Regular expression forwarded to
            ``has_valid_candidate``.
        timeout: Maximum number of seconds to wait. Defaults to 5, the value
            that used to be hard-coded.

    Raises:
        selenium.common.exceptions.TimeoutException: If no matching element
            shows up within ``timeout`` seconds.
    """
    WebDriverWait(driver, timeout).until(
        lambda d: has_valid_candidate(d, candidate_name_pattern)
    )


def load_ballot_box_page(url: str, driver, candidate_name_pattern, max_retries=3):
    """Open a ballot-box page and wait until candidate entries are rendered.

    After the first load the page is refreshed on every attempt, then the
    function waits for an element with class ``cargo-fixo`` and for at least
    one ``<p>`` matching ``candidate_name_pattern``.

    Args:
        url: Address of the ballot-box page.
        driver: WebDriver used to load the page.
        candidate_name_pattern: Regular expression forwarded to
            ``wait_for_valid_candidate``.
        max_retries: Number of refresh-and-wait attempts before giving up.

    Returns:
        The same ``driver``, positioned on the loaded page.

    Raises:
        Exception: If every attempt fails; the error of the last attempt is
            attached as ``__cause__``. A timeout of the initial wait for
            ``captureDiv`` propagates unchanged.
    """
    driver.get(url)
    # Wait for the main element to load (initial load)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "captureDiv"))
    )

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            # NOTE(review): the refresh runs on every attempt, including the
            # first; presumably needed because consecutive URLs differ only in
            # the fragment - confirm before removing.
            time.sleep(1)
            driver.refresh()

            # Wait for the element with candidates to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "cargo-fixo"))
            )

            # Wait until valid candidates are present
            wait_for_valid_candidate(driver, candidate_name_pattern)

            # Valid candidates were found
            return driver

        except Exception as e:
            # Remember the failure so the final error can point at its cause.
            last_error = e
            print(f"Retry {attempt} failed. Reason: {str(e)}")

    # Every attempt failed
    raise Exception(f"Failed to load valid candidates after {max_retries} retries.") from last_error
    

def fetch_ballot_box_roles_candidates_html(driver):
    """Parse the rendered page and return its ``cargo-fixo`` elements.

    Args:
        driver: WebDriver whose ``page_source`` is parsed with BeautifulSoup.

    Returns:
        All elements with class ``cargo-fixo`` found inside the element whose
        id is ``captureDiv``.
    """
    page = BeautifulSoup(driver.page_source, 'html.parser')
    capture_div = page.find(id='captureDiv')
    return capture_div.find_all(class_='cargo-fixo')


def fetch_candidates_elements(full_url:str, driver, candidate_name_pattern):
    """Load a ballot-box page and group its candidate ``<p>`` elements by role.

    For every ``cargo-fixo`` element the role title is read from its ``<h1>``
    and the ``<p>`` elements of its parent are kept when their cleaned text
    matches ``candidate_name_pattern``.

    Args:
        full_url: Address of the ballot-box page.
        driver: WebDriver used to load the page.
        candidate_name_pattern: Regular expression a ``<p>`` text must match
            (from its start) to be kept.

    Returns:
        A dict mapping the upper-cased role title to the list of matching
        ``<p>`` elements. An empty dict is returned when the page cannot be
        loaded or parsed; the error is printed, not raised.
    """
    candidates_elements_by_role = {}
    try:
        driver = load_ballot_box_page(full_url,driver,candidate_name_pattern)
        roles_candidates = fetch_ballot_box_roles_candidates_html(driver)

        for role in roles_candidates:
            # H1 represents candidate role title
            position_title = role.find('h1')
            if position_title is None:
                # Without a title the entries cannot be assigned to a role;
                # skip this block instead of discarding the whole page.
                continue
            role_name = position_title.text.strip().upper()

            # parent can be a table with VEREADOR or PREFEITO
            parent = role.find_parent()

            # P elements represents the name of each candidate
            for p_element in parent.find_all('p'):
                name = clean_candidate_name_from_element(p_element).strip()
                if re.match(candidate_name_pattern, name, re.UNICODE):
                    candidates_elements_by_role.setdefault(role_name, []).append(p_element)

        return candidates_elements_by_role
    except Exception as e:
        # Best effort: report the failure and give the caller an empty result
        # it can iterate over, rather than an implicit None.
        print(f"Failed to collect candidates from {full_url}: {e}")
        return {}


def clean_candidate_name_from_element(p_element):
    """Return the element's text without surrounding whitespace, upper-cased.

    Args:
        p_element: Object exposing the raw text through a ``text`` attribute.

    Returns:
        The stripped, upper-cased text.
    """
    return p_element.text.strip().upper()


def collect_candidate_number_from_element(p_element):
    """Return the first whitespace-separated token of the element's text.

    The text is cleaned with ``clean_candidate_name_from_element`` first.
    Splitting happens on any whitespace (not only a single space), which is
    consistent with the ``\\s``-based patterns used to select the elements.

    Args:
        p_element: Element holding "<number> <name>" style text.

    Returns:
        The leading token, or an empty string when the text is empty.
    """
    tokens = clean_candidate_name_from_element(p_element).split(maxsplit=1)
    NR_CANDIDATO = tokens[0] if tokens else ""
    return NR_CANDIDATO


def collect_candidate_name_from_element(p_element):
    """Return the element's text without its first token.

    The text is cleaned with ``clean_candidate_name_from_element`` and split
    on any run of whitespace, so repeated spaces, tabs or line breaks between
    tokens do not leave leading or doubled spaces in the result.

    Args:
        p_element: Element holding "<number> <name>" style text.

    Returns:
        The remaining tokens joined by single spaces, or ``None`` when the
        text has fewer than two tokens.
    """
    tokens = clean_candidate_name_from_element(p_element).split()
    NM_CANDIDATO = None
    if len(tokens) > 1:
        NM_CANDIDATO = " ".join(tokens[1:])
    return NM_CANDIDATO


def collect_candidate_votes_from_element(p_element) -> str:
    """Read the vote count shown next to a candidate entry.

    Looks in the parent of ``p_element`` for the first element with class
    ``titulo-sm`` and returns the text of that element's next sibling.

    Args:
        p_element: Parsed ``<p>`` element of a candidate.

    Returns:
        str: The sibling's text with surrounding whitespace removed.
    """
    container = p_element.find_parent()
    votes_label = container.find(class_='titulo-sm')
    return votes_label.find_next_sibling().text.strip()

    
def fetch_ballot_box_candidates_information(candidates_role_elements: dict,  nr_secao: str, nr_zona: str, cd_mu: str) -> list[dict[str,str]]:
    """Build one record per candidate element of a ballot box.

    Args:
        candidates_role_elements: Mapping of role name to the list of
            candidate ``<p>`` elements. ``None`` or an empty mapping (what a
            failed page load produces) yields an empty result.
        nr_secao: Value stored under ``"NR_SECAO"`` in every record.
        nr_zona: Value stored under ``"NR_ZONA"`` in every record.
        cd_mu: Value stored under ``"CD_MUNICIPIO"`` in every record.

    Returns:
        A list of dicts with the keys ``NR_CANDIDATO``, ``TOTAL_VOTOS``,
        ``NR_ZONA``, ``NR_SECAO`` and ``CD_MUNICIPIO``, in the iteration order
        of the mapping. The role name itself is not part of the records.
    """
    if not candidates_role_elements:
        # Nothing was scraped for this section.
        return []

    return [
        {
            "NR_CANDIDATO": collect_candidate_number_from_element(p_element),
            "TOTAL_VOTOS": collect_candidate_votes_from_element(p_element),
            "NR_ZONA": nr_zona,
            "NR_SECAO": nr_secao,
            "CD_MUNICIPIO": cd_mu,
        }
        for p_elements in candidates_role_elements.values()
        for p_element in p_elements
    ]



In [199]:
# Scratch check: splitting a string that contains no space on " " yields a
# single-element list holding the whole string.
test = "02231"
test.split(" ")

['02231']