In [71]:
# Table to be used as the location (loc) dimension.

In [72]:
import csv
import json
import logging
from pathlib import Path
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError
import pandas as pd
import gzip

In [73]:
# Logging configuration and module-level constants.
# The root logger emits INFO and above as "timestamp - level - message".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# IBGE localities endpoint queried by fetch_ibge_data.
URL = "https://servicodados.ibge.gov.br/api/v1/localidades/municipios"
# Dataset names: used as record tags, as keys of HEADERS and as output file stems.
CITIES = "cities"
STATES = "states"
# Column order used for each dataset's CSV output.
HEADERS = {CITIES: ("code", "name", "state"), STATES: ("code", "abbr", "name")}
# File extensions for which DataWriter builds output paths.
FORMATS = ("csv", "parquet")

In [74]:
def fetch_ibge_data(state_filter=None):
    """
    Fetch the IBGE municipality list and yield city and state records.

    The whole JSON payload is downloaded and parsed up front (gzip-compressed
    responses are handled); records are then yielded one at a time as
    ``(CITIES, city_dict)`` or ``(STATES, state_dict)`` tuples. Each state is
    yielded once, right after the first selected city that belongs to it.

    Because this is a generator function, calling it always returns a
    generator object. Download or parse failures are logged and simply end
    the generator without yielding anything; they are not raised.

    Parameters:
    state_filter: Optional state abbreviation (UF), e.g. "AC". When set,
                  only cities of that state (and the state itself) are
                  yielded. When None or empty, every city and state is
                  yielded.

    Yields:
    (CITIES, {"code", "name", "state"}) for every selected municipality and
    (STATES, {"abbr", "code", "name"}) once per selected state.
    """
    states_yielded = set()

    logging.info(f"Fetching data from {URL}…")

    try:
        request = Request(URL)
        request.add_header('Accept-encoding', 'gzip')
        with urlopen(request) as response:
            if response.info().get('Content-Encoding') == 'gzip':
                logging.info("Response is compressed with gzip.")
                with gzip.GzipFile(fileobj=response) as decompressed:
                    data = json.loads(decompressed.read().decode('utf-8'))
            else:
                data = json.loads(response.read().decode('utf-8'))
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        logging.error(f"HTTP error occurred: {e.code} - {e.reason}")
        return
    except URLError as e:
        logging.error(f"Failed to reach the server: {e.reason}")
        return
    except Exception as e:
        # Best-effort boundary: anything else (e.g. invalid JSON) is logged
        # and results in an empty stream.
        logging.error(f"An error occurred: {e}")
        return

    for obj in data:
        uf = _extract_uf(obj)
        if uf is None:
            # Without a UF the record cannot be assigned to a state; skip it
            # instead of aborting the whole export.
            logging.warning(f"Skipping municipality without UF information: {obj.get('id')}")
            continue

        city = {
            # Only the first six characters of the id are kept
            # (presumably dropping the trailing check digit - confirm).
            "code": str(obj["id"])[:6],
            "name": obj["nome"],
            "state": uf["sigla"],
        }

        # Apply the state filter when one was given.
        if state_filter and city["state"] != state_filter:
            continue

        yield (CITIES, city)

        if city["state"] in states_yielded:
            continue

        state = {
            "abbr": city["state"],
            "code": uf["id"],
            "name": uf["nome"],
        }
        yield (STATES, state)
        states_yielded.add(state["abbr"])


def _extract_uf(obj):
    """
    Return the UF (state) dict of a municipality record, or None if absent.

    The UF is normally nested under "microrregiao" -> "mesorregiao". The API
    may return null for "microrregiao"; in that case the UF is looked up
    under "regiao-imediata" -> "regiao-intermediaria" instead.
    """
    micro = obj.get("microrregiao")
    if micro:
        return micro["mesorregiao"]["UF"]

    immediate = obj.get("regiao-imediata")
    if immediate:
        return immediate["regiao-intermediaria"]["UF"]

    return None

In [75]:
class DataWriter:
    """
    Collect tagged records from a generator and save them to disk.

    The generator must yield ``(name, row)`` tuples where ``name`` is one of
    the keys of HEADERS and ``row`` is a dict with the matching columns.
    Output paths are built for every dataset/format combination; only the
    CSV files are currently written (Parquet output is disabled in
    ``save_data``).
    """

    def __init__(self, data_generator, output_dir):
        """
        Parameters:
        data_generator: Iterable of (name, row) tuples to be saved.
        output_dir: Directory for the output files; created (with parents)
                    if it does not exist yet.
        """
        self.data_generator = data_generator
        self.data = {CITIES: [], STATES: []}
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.paths = self._generate_paths()

    def _generate_paths(self):
        """
        Build the output paths, keyed by file name ("<dataset>.<ext>").
        """
        return {
            f"{name}.{ext}": self.output_dir / f"{name}.{ext}"
            for name in HEADERS for ext in FORMATS
        }

    def _sort_data(self):
        """
        Sort every dataset by its "name" column to make the output easier to read.
        """
        logging.info("Sorting data…")
        for name in HEADERS:
            self.data[name] = sorted(self.data[name], key=lambda row: row["name"])

    def _write_csv(self):
        """
        Write every dataset as a UTF-8 CSV file with a header row.
        """
        for name, headers in HEADERS.items():
            csv_path = self.paths[f"{name}.csv"]
            # newline='' lets the csv module control line endings itself.
            with csv_path.open("w", encoding="utf-8", newline='') as file:
                writer = csv.DictWriter(file, fieldnames=headers)
                writer.writeheader()
                writer.writerows(self.data[name])
            logging.info(f"CSV file saved: {csv_path}")

    def _write_parquet(self):
        """
        Write every dataset as a Parquet file (via pandas).
        """
        for name in HEADERS:
            parquet_path = self.paths[f"{name}.parquet"]
            df = pd.DataFrame(self.data[name])
            df.to_parquet(parquet_path, index=False)
            logging.info(f"Parquet file saved: {parquet_path}")

    def _cleanup_existing_files(self):
        """
        Delete output files left over from a previous run.

        Every path in ``self.paths`` is removed, Parquet files included.
        """
        for path in self.paths.values():
            if path.exists():
                path.unlink()
                logging.info(f"Deleted existing file: {path}")

    def save_data(self):
        """
        Consume the generator and write the collected records to disk.

        If the generator yields nothing (for example because the upstream
        fetch failed), a warning is logged and the existing output files are
        left untouched instead of being replaced by header-only files.
        """
        for name, row in self.data_generator:
            self.data[name].append(row)

        if not any(self.data.values()):
            logging.warning("No data received; existing output files were left untouched.")
            return

        self._sort_data()
        self._cleanup_existing_files()
        self._write_csv()
        # Parquet output is currently disabled; call self._write_parquet()
        # here to enable it.

In [76]:
def main(output_dir, state_filter=None):
    """
    Fetch the IBGE data and save it under output_dir.

    Parameters:
    output_dir: Directory where the output files are saved; DataWriter
                creates it if it does not exist.
    state_filter: Optional state abbreviation (UF) to restrict the export
                  to a single state. When None, all states are exported.
    """
    # fetch_ibge_data is a generator function, so the object it returns is
    # always truthy; there is nothing to check before handing it to the
    # writer. A failed download shows up as an empty stream instead.
    writer = DataWriter(fetch_ibge_data(state_filter), output_dir)
    writer.save_data()

In [77]:
if __name__ == "__main__":
    # Script entry point: export the data of a single state into the sample directory.
    output_dir = "../../dataDiscovery/ibge/sample/"  # relative to the current working directory
    state_filter = "AC"  # state abbreviation (UF) to keep
    main(output_dir, state_filter)
    # NOTE(review): main() returns nothing and fetch errors are only logged,
    # so this message is emitted even when the download failed.
    logging.info("Process completed successfully!")

2024-09-15 17:57:44,039 - INFO - Fetching data from https://servicodados.ibge.gov.br/api/v1/localidades/municipios…
2024-09-15 17:57:44,253 - INFO - Response is compressed with gzip.
2024-09-15 17:57:44,468 - INFO - Sorting data…
2024-09-15 17:57:44,469 - INFO - Deleted existing file: ..\..\dataDiscovery\ibge\sample\cities.csv
2024-09-15 17:57:44,470 - INFO - Deleted existing file: ..\..\dataDiscovery\ibge\sample\states.csv
2024-09-15 17:57:44,471 - INFO - CSV file saved: ..\..\dataDiscovery\ibge\sample\cities.csv
2024-09-15 17:57:44,472 - INFO - CSV file saved: ..\..\dataDiscovery\ibge\sample\states.csv
2024-09-15 17:57:44,473 - INFO - Process completed successfully!
