#Processamento de Raw XML to bronze

In [0]:
# ================================================================
# Bronze - Importar XML (IBGE)
# ================================================================
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime
import xml.etree.ElementTree as ET
import re


In [0]:
# Obtain the Spark session for this notebook (reuses the active one if it
# already exists, otherwise creates it under this application name).
spark = (
    SparkSession.builder
    .appName("Processamento_raw_xml_to_bronze")
    .getOrCreate()
)

In [0]:
# ================================================================
# Parameters received from the Databricks Job (notebook widgets)
# ================================================================
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("schema", "")
dbutils.widgets.text("file_prefix", "")
dbutils.widgets.text("data_ref_carga", "")

# Strip surrounding whitespace so that a blank-but-not-empty value is
# treated as missing instead of silently slipping through validation.
catalog        = dbutils.widgets.get("catalog").strip()
schema_out     = dbutils.widgets.get("schema").strip()
file_prefix    = dbutils.widgets.get("file_prefix").strip()
data_ref_carga = dbutils.widgets.get("data_ref_carga").strip()

# All four parameters are mandatory; report exactly which ones are absent.
missing_params = [
    param_name
    for param_name, param_value in (
        ("catalog", catalog),
        ("schema", schema_out),
        ("file_prefix", file_prefix),
        ("data_ref_carga", data_ref_carga),
    )
    if not param_value
]
if missing_params:
    raise ValueError(
        "Parâmetros obrigatórios ausentes: " + ", ".join(missing_params)
    )

# Normalized prefix: lower-cased, with every character outside [0-9a-zA-Z_]
# replaced by "_". Used to match file names in the RAW volume.
file_prefix_clean = re.sub(r"[^0-9a-zA-Z_]", "_", file_prefix.lower())

# RAW volume path where the XML files are looked up
path_raw = f"/Volumes/{catalog}/raw/raw_tbra"

print("Buscando arquivos em:", path_raw)

In [0]:
# ------------------------------------------------------------
# List ONLY the XML files found in the Volume
# ------------------------------------------------------------
files = dbutils.fs.ls(path_raw)

# Keep entries whose name ends in ".xml" (case-insensitive) and normalize
# their paths by dropping the "dbfs:" scheme.
xml_entries = (entry for entry in files if entry.name.lower().endswith(".xml"))
all_files = [entry.path.replace("dbfs:", "") for entry in xml_entries]

# Keep only the files whose (lower-cased) base name starts with the
# normalized prefix.
candidates = [
    xml_file
    for xml_file in all_files
    if xml_file.lower().rsplit("/", 1)[-1].startswith(file_prefix_clean)
]

print("Arquivos encontrados:", all_files)
print("Candidatos:", candidates)

# Nothing to process: stop the run with an explicit error.
if not candidates:
    raise Exception(f"Nenhum XML com prefixo '{file_prefix_clean}' encontrado.")

In [0]:
def extrair_data(nome: str) -> datetime:
    """Extract the date embedded in a file name.

    Looks for the first run of eight consecutive digits in ``nome`` and
    parses it as ``YYYYMMDD``.

    Args:
        nome: File name (or any string) that may contain a date stamp.

    Returns:
        The parsed date as a ``datetime``, or ``datetime.min`` when the name
        has no eight-digit run or the digits do not form a valid calendar
        date. Using ``datetime.min`` makes such names sort as the oldest.
    """
    match = re.search(r"(\d{8})", nome)
    if match is None:
        return datetime.min
    try:
        return datetime.strptime(match.group(1), "%Y%m%d")
    except ValueError:
        # Eight digits that are not a real date (e.g. "20241345") must not
        # abort the caller; treat them like a name without a date.
        return datetime.min

# Order the candidates newest-first by the date found in each base name.
candidates_sorted = sorted(
    candidates,
    key=lambda caminho: extrair_data(caminho.rsplit("/", 1)[-1]),
    reverse=True,
)

In [0]:
# The list is ordered newest-first, so its head is the ONLY file processed.
xml_path = candidates_sorted[0]
file_name = xml_path.rsplit("/", 1)[-1]

print(f"Arquivo selecionado (mais recente): {file_name}")

In [0]:
# Build the table name from the sanitized prefix (the same value used to
# match the file names). The raw widget value may contain characters such as
# "-", "." or spaces, which presumably are not accepted in an unquoted table
# identifier and would make the write fail.
table_name = f"e_{file_prefix_clean}"

# Fully qualified table name in the catalog:
bronze_table = f"{catalog}.{schema_out}.{table_name}"

print(f"Nome final da tabela Bronze: {bronze_table}")

In [0]:
# Read the selected XML file, tag every row with its source file name and the
# load reference date, and overwrite the Bronze Delta table with the result.
try:
    df = (
        spark.read
        .format("com.databricks.spark.xml")
        .option("rowTag", "gmd:MD_Metadata")
        .load(xml_path)
        .withColumn("file_name", F.lit(file_name))
        .withColumn("data_ref_carga", F.lit(data_ref_carga))
    )

    (
        df.write
        .format("delta")
        .mode("overwrite")
        # Allow the table schema to be replaced along with the data.
        .option("overwriteSchema", "true")
        .saveAsTable(bronze_table)
    )
except Exception as e:
    print(f"Erro ao processar {file_name}: {e}")
    # Re-raise so the run is reported as failed; swallowing the error here
    # would let the job finish "successfully" without a Bronze table.
    raise
else:
    print(f"Bronze gravado com sucesso: {bronze_table}")