# Processamento de Raw XML para Bronze

In [0]:
# ================================================================
# Bronze - Importar XML (IBGE)
# ================================================================
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime
import xml.etree.ElementTree as ET
import re


In [0]:
# Obtain the Spark session for this notebook (getOrCreate returns the active
# session when one exists) under this job's application name.
spark = (
    SparkSession.builder
    .appName("Processamento_raw_xml_to_bronze")
    .getOrCreate()
)

In [0]:
# ================================================================
# Parameters received from the Databricks Job (notebook widgets)
# ================================================================
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("schema", "")
dbutils.widgets.text("file_prefix", "")
dbutils.widgets.text("data_ref_carga", "")

# Strip surrounding whitespace up front so a blank-looking value such as " "
# is treated as missing instead of silently producing an empty prefix (which
# would match every XML file and target a table named just "e_").
catalog        = dbutils.widgets.get("catalog").strip()
schema_out     = dbutils.widgets.get("schema").strip()
file_prefix    = dbutils.widgets.get("file_prefix").strip()
data_ref_carga = dbutils.widgets.get("data_ref_carga").strip()

# Every parameter is mandatory; report exactly which ones are absent.
missing_params = [
    widget_name
    for widget_name, widget_value in (
        ("catalog", catalog),
        ("schema", schema_out),
        ("file_prefix", file_prefix),
        ("data_ref_carga", data_ref_carga),
    )
    if not widget_value
]
if missing_params:
    raise ValueError(
        "Parâmetros obrigatórios ausentes: " + ", ".join(missing_params)
    )

# Lower-cased prefix with anything outside [0-9a-zA-Z_] replaced by "_", so it
# can be embedded in a table name.
file_prefix_clean = re.sub(r"[^0-9a-zA-Z_]", "_", file_prefix.lower())

# RAW location (Unity Catalog Volume path)
path_raw = f"/Volumes/{catalog}/raw/raw_tbra"

print("Buscando arquivos em:", path_raw)

In [0]:
# ------------------------------------------------------------
# LIST ONLY THE XML FILES IN THE VOLUME
# ------------------------------------------------------------
files = dbutils.fs.ls(path_raw)

# The listed paths carry a "dbfs:" scheme prefix; drop it so the remaining
# path can be handed to the reader as a plain /Volumes/... path.
all_files = [
    f.path.replace("dbfs:", "")
    for f in files
    if f.name.lower().endswith(".xml")
]

# Filter by prefix. The prefix was normalised (lower-cased, characters outside
# [0-9a-zA-Z_] replaced by "_"), so the file name must go through the same
# normalisation before comparing; otherwise a prefix such as "ibge-2020"
# could never match "ibge-2020.xml". Names that matched before still match.
candidates = [
    p for p in all_files
    if re.sub(r"[^0-9a-zA-Z_]", "_", p.split("/")[-1].lower())
        .startswith(file_prefix_clean)
]

print("Arquivos encontrados:", all_files)
print("Candidatos:", candidates)

# Nothing to load: stop the job with an explicit error.
if not candidates:
    raise Exception(f"Nenhum XML com prefixo '{file_prefix_clean}' encontrado.")

In [0]:
# ------------------------------------------------------------
# PROCESS EVERY CANDIDATE XML, WRITE BRONZE AND THE TECHNICAL LOG
# ------------------------------------------------------------
# The read/write step has to run inside the loop (hence a single cell):
# placed after the loop it would only ever see the last candidate.

# Row tag used to split each XML document into rows.
row_tag = "gmd:MD_Metadata"

# All files of one prefix land in the same bronze table (always prefixed e_).
table_name = f"e_{file_prefix_clean}"
bronze_table = f"{catalog}.{schema_out}.{table_name}"
log_table = f"{catalog}.{schema_out}.log_carga_xml"

logs = []            # one tuple per file, in the column order of log_schema
failed_files = []    # names of the files that raised during read/write

# The partition of this data_ref_carga is replaced only by the first
# successful write; later files are appended. Overwriting it for every file
# would leave only the last file's rows in the table.
partition_replaced = False

for xml_path in candidates:
    file_name = xml_path.split("/")[-1]
    started_at = datetime.now()
    status = "SUCESSO"
    error_msg = None
    row_count = None

    print(f"Processando arquivo {file_name}")
    print("Path final utilizado no load():", xml_path)

    try:
        # Read the XML and attach the load metadata columns.
        df = (
            spark.read
                .format("xml")
                .option("rowTag", row_tag)
                .load(xml_path)
                .withColumn("file_name", F.lit(file_name))
                .withColumn("data_ref_carga", F.lit(data_ref_carga))
        )
        # Counted for the technical log (triggers a read of the file).
        row_count = df.count()

        # mergeSchema instead of overwriteSchema: Delta does not apply
        # overwriteSchema to a partial (replaceWhere) overwrite, while
        # mergeSchema lets files with extra inferred columns be written.
        writer = (
            df.write
                .format("delta")
                .option("mergeSchema", "true")
                .partitionBy("data_ref_carga")
        )
        if partition_replaced:
            writer = writer.mode("append")
        else:
            writer = (
                writer
                    .mode("overwrite")
                    .option("replaceWhere", f"data_ref_carga = '{data_ref_carga}'")
            )
        writer.saveAsTable(bronze_table)
        partition_replaced = True

        print(f"Bronze gravado com sucesso: {bronze_table}")

    except Exception as e:
        # Keep going with the remaining files; the failure is recorded in the
        # log and reported once the log has been written.
        status = "ERRO"
        error_msg = str(e)
        failed_files.append(file_name)
        print(f"Erro ao processar {file_name}: {e}")

    finished_at = datetime.now()
    logs.append((
        file_name,
        bronze_table,
        row_tag,
        status,
        error_msg,
        finished_at,
        data_ref_carga,
        row_count,
        (finished_at - started_at).total_seconds(),
    ))

# ------------------------------------------------------------
# WRITE TECHNICAL LOG (append)
# ------------------------------------------------------------
# Explicit types: with column names only, Spark cannot infer a type for
# erro_msg / qtd_registros when every value is None.
log_schema = (
    "arquivo STRING, tabela STRING, row_tag STRING, status STRING, "
    "erro_msg STRING, data_log TIMESTAMP, data_ref_carga STRING, "
    "qtd_registros BIGINT, duracao_segundos DOUBLE"
)

# Logging is best-effort: a failure here is reported but does not by itself
# abort the run. saveAsTable in append mode creates the table when missing.
try:
    df_log = spark.createDataFrame(logs, schema=log_schema)
    (
        df_log.write
            .format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .saveAsTable(log_table)
    )
except Exception as e:
    print("Falha ao gravar log:", e)

print("Processamento BRONZE finalizado.")

# Do not report SUCCESS when any file failed to load.
if failed_files:
    raise RuntimeError(
        f"Falha ao processar {len(failed_files)} arquivo(s): "
        + ", ".join(failed_files)
    )

dbutils.notebook.exit("SUCCESS")