# Camada Bronze - Arquitetura Medalhão

**Objetivo:** Ingestão de dados brutos da camada Landing para Bronze em formato Delta Lake.

**Operações:**
- Leitura de arquivos CSV do volume Landing
- Adição de coluna ingestion_timestamp
- Gravação em tabelas Delta gerenciadas
- Ingestão de cotação do dólar via API BCB


## Setup Inicial

In [0]:
%sql
-- Create the catalog used by this notebook; does nothing if it already exists.
CREATE CATALOG IF NOT EXISTS medalhao;

In [0]:
%sql
-- Select the medalhao catalog so the unqualified schema names below are created inside it.
USE CATALOG medalhao;

-- Create the bronze and silver schemas if they do not exist yet.
CREATE SCHEMA IF NOT EXISTS bronze;
CREATE SCHEMA IF NOT EXISTS silver;

In [0]:
%sql
-- Create the landing volume in the default schema of the medalhao catalog.
-- The catalog is selected explicitly so this cell does not depend on an
-- earlier cell having set the current catalog; otherwise the volume could be
-- created in whatever catalog the session happens to be using.
USE CATALOG medalhao;
USE SCHEMA default;
CREATE VOLUME IF NOT EXISTS landing;

## Configuração Inicial

In [0]:
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, col, max
import pyspark.sql.functions as F
import requests
import pandas as pd

In [None]:
# Catalog and schema names referenced by the rest of the notebook.
catalog_name = "medalhao"
bronze_db_name, silver_db_name = "bronze", "silver"

# Volume that holds the raw CSV files.
landing_volume = "landing"

# Root path of the landing volume, located in the catalog's "default" schema.
base_path = f"/Volumes/{catalog_name}/default/{landing_volume}"

In [0]:
# Make the configured catalog and its bronze schema the current ones for
# this Spark session.
spark.sql(f"USE CATALOG {catalog_name}")
spark.sql(f"USE SCHEMA {bronze_db_name}")

## Ingestão de Arquivos CSV

In [0]:
def ingest_csv(file_name: str, table_name: str):
    """Load one CSV file from the landing volume into a bronze Delta table.

    The file is read with a header row and schema inference, gets an
    ``ingestion_timestamp`` column, and is written in ``overwrite`` mode, so
    every run replaces the previous contents of the target table.

    Failures are reported with ``print`` instead of being raised, so one bad
    file does not interrupt the ingestion of the files that follow it.

    Args:
        file_name: CSV file name, relative to ``base_path``.
        table_name: Target table name inside the bronze schema.
    """
    file_path = f"{base_path}/{file_name}"
    full_table_name = f"{catalog_name}.{bronze_db_name}.{table_name}"

    print(f"Ingestão: {file_name} → {full_table_name}")

    try:
        # NOTE(review): quoted fields with embedded line breaks would need the
        # CSV "multiLine" option - confirm whether any source file has them.
        df = (
            spark.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(file_path)
            .withColumn("ingestion_timestamp", current_timestamp())
        )

        # Column types are inferred on every run, so allow the overwrite to
        # replace the table schema as well instead of failing when an
        # inferred type differs from the one already stored.
        (
            df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(full_table_name)
        )

        # Count the saved table rather than `df`: counting `df` would read
        # the CSV file a second time.
        row_count = spark.table(full_table_name).count()
        print(f"Sucesso: {full_table_name} ({row_count} registros)")

    except Exception as e:
        # Deliberate best effort: report the failure and let the caller go on.
        print(f"Erro: {file_name} - {e}")

# (landing file, bronze table) pairs, ingested in this order.
csv_sources = [
    ("olist_customers_dataset.csv", "ft_consumidores"),
    ("olist_geolocation_dataset.csv", "ft_geolocalizacao"),
    ("olist_order_items_dataset.csv", "ft_itens_pedidos"),
    ("olist_order_payments_dataset.csv", "ft_pagamentos_pedidos"),
    ("olist_order_reviews_dataset.csv", "ft_avaliacoes_pedidos"),
    ("olist_orders_dataset.csv", "ft_pedidos"),
    ("olist_products_dataset.csv", "ft_produtos"),
    ("olist_sellers_dataset.csv", "ft_vendedores"),
    ("product_category_name_translation.csv", "dm_categoria_produtos_traducao"),
]

for csv_file, bronze_table in csv_sources:
    ingest_csv(csv_file, bronze_table)

print("Ingestão CSV concluída")

## Validação de Tabelas Bronze

In [0]:
def preview_tables(tables: list, limit_rows: int = 5):
    """Display the first rows of each listed bronze table.

    A table that cannot be read is reported with ``print`` and the loop moves
    on to the next one.

    Args:
        tables: Table names inside the bronze schema.
        limit_rows: Maximum number of rows displayed per table.
    """
    for table_name in tables:
        qualified_name = f"{catalog_name}.{bronze_db_name}.{table_name}"
        print(f"Tabela: {qualified_name}")
        try:
            sample = spark.table(qualified_name).limit(limit_rows)
            display(sample)
        except Exception as err:
            print(f"Erro: {qualified_name} - {err}")

# Bronze tables loaded from the CSV files, in ingestion order.
bronze_tables = [
    "ft_consumidores",
    "ft_geolocalizacao",
    "ft_itens_pedidos",
    "ft_pagamentos_pedidos",
    "ft_avaliacoes_pedidos",
    "ft_pedidos",
    "ft_produtos",
    "ft_vendedores",
    "dm_categoria_produtos_traducao",
]

# Show a short sample of every table as a quick visual check.
preview_tables(tables=bronze_tables)


## Ingestão de Cotação do Dólar - API BCB

In [0]:
# Fully qualified name of the bronze table that receives the dollar quotes.
table_full_name = f"{catalog_name}.{bronze_db_name}.dm_cotacao_dolar"

# Date format written into the API request parameters.
API_OUTPUT_FORMAT = "%m-%d-%Y"

# Input formats tried, in this order, when parsing a date string.
POSSIBLE_INPUT_FORMATS = [
    "%Y-%m-%d",
    "%d/%m/%Y",
    "%m-%d-%Y",
    "%Y/%m/%d",
]

In [0]:
def normalize_widget_date(date_str: str) -> str:
    """Convert a date string to the ``API_OUTPUT_FORMAT`` representation.

    Each pattern in ``POSSIBLE_INPUT_FORMATS`` is tried in order and the
    first one that parses the input is used.

    Args:
        date_str: Date text in one of the accepted input formats.

    Returns:
        The same date rendered with ``API_OUTPUT_FORMAT``.

    Raises:
        ValueError: If ``date_str`` is empty or matches none of the formats.
    """
    if not date_str:
        raise ValueError(f"Data vazia: '{date_str}'")

    for candidate_format in POSSIBLE_INPUT_FORMATS:
        try:
            parsed = datetime.strptime(date_str, candidate_format)
            return parsed.strftime(API_OUTPUT_FORMAT)
        except ValueError:
            # This pattern does not match; fall through to the next one.
            continue

    raise ValueError(f"Formato de data inválido: '{date_str}'")

# Extraction window: a fixed start date up to the current date.
raw_start_date = "2010-01-01"
raw_end_date = datetime.now().strftime("%Y-%m-%d")

# Convert both bounds to the format used in the API request.
start_date_formatted, end_date_formatted = (
    normalize_widget_date(raw_date) for raw_date in (raw_start_date, raw_end_date)
)

print(f"Período de extração: {start_date_formatted} até {end_date_formatted}")



In [0]:
# Request URL for the "CotacaoDolarPeriodo" resource, limited to the
# extraction window and to the two selected fields, returned as JSON.
ENDPOINT = (
    "https://olinda.bcb.gov.br/olinda/servico/PTAX/versao/v1/odata/"
    "CotacaoDolarPeriodo(dataInicial=@dataInicial,dataFinalCotacao=@dataFinalCotacao)?"
    f"@dataInicial='{start_date_formatted}'&@dataFinalCotacao='{end_date_formatted}'"
    "&$select=dataHoraCotacao,cotacaoCompra&$format=json"
)

# Fail the cell on connection problems, timeouts or HTTP error statuses.
resp = requests.get(ENDPOINT, timeout=30)
resp.raise_for_status()

payload = resp.json()
records = payload.get("value", [])

if not records:
    print("API retornou 0 registros")
    # Use the same column names as the populated branch so later cells see
    # one schema either way.
    # NOTE(review): purchase_rate is presumably numeric (double) when the API
    # returns data - confirm against a real response.
    spark_df = spark.createDataFrame(
        [], schema="dataHoraCotacao:string, purchase_rate:double"
    ).withColumn("ingestion_timestamp", current_timestamp())
else:
    # rename() ignores labels that are absent, so no membership check is needed.
    df_pd = pd.DataFrame(records).rename(columns={"cotacaoCompra": "purchase_rate"})

    # Keep the column available even if the response omitted it.
    if "dataHoraCotacao" not in df_pd.columns:
        df_pd["dataHoraCotacao"] = None

    spark_df = spark.createDataFrame(df_pd)
    spark_df = spark_df.withColumn("ingestion_timestamp", current_timestamp())

    print(f"Extraídos: {spark_df.count()} registros")
    display(spark_df.limit(5))



In [0]:
# Count once and reuse the value: every count() call is a separate Spark action.
quote_count = spark_df.count()

if quote_count == 0:
    print("Nenhuma cotação para carregar")
else:
    # Replace the table contents, and its schema, with the extracted quotes.
    (
        spark_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_full_name)
    )

    print(f"Salvo: {table_full_name} ({quote_count} registros)")

    # Show the rows with the highest dataHoraCotacao values as a quick check
    # of what was written.
    display(spark.table(table_full_name).orderBy(col("dataHoraCotacao").desc()).limit(10))


## Validação Final

In [None]:
# Every bronze table this notebook writes: the CSV loads plus the dollar quotes.
bronze_tables_all = [
    "ft_consumidores",
    "ft_geolocalizacao",
    "ft_itens_pedidos",
    "ft_pagamentos_pedidos",
    "ft_avaliacoes_pedidos",
    "ft_pedidos",
    "ft_produtos",
    "ft_vendedores",
    "dm_categoria_produtos_traducao",
    "dm_cotacao_dolar",
]

separator = "=" * 80

print(separator)
print("RESUMO - CAMADA BRONZE")
print(separator)

# Print one line per table: its row count, or the error raised while reading it.
for table in bronze_tables_all:
    full_name = f"{catalog_name}.{bronze_db_name}.{table}"
    try:
        count = spark.table(full_name).count()
    except Exception as e:
        print(f"{table}: ERRO - {e}")
    else:
        print(f"{table}: {count:,} registros")

print(separator)