In [0]:
# ================================================================
# Silver - Transformar XML Limites dos Biomas (IBGE)
# ================================================================
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: getOrCreate() reuses the session
# that is already active (if any) instead of building a second one.
spark = (
    SparkSession.builder
    .appName("Silver_LimitesBiomas_XML")
    .getOrCreate()
)

In [0]:
# ================================================================
# Parameter received via Databricks Job
# ================================================================
# Declare the text widget with an empty default so the value can be supplied
# from outside the notebook; an empty value is rejected below.
dbutils.widgets.text("data_ref_carga", "")

# Strip surrounding whitespace: a padded value such as " 2024-01-31 " would
# otherwise pass the emptiness check, never match any Bronze partition and
# silently trigger the "latest partition" fallback further down.
data_ref_carga = dbutils.widgets.get("data_ref_carga").strip()

if not data_ref_carga:
    raise ValueError("‚ùå Par√¢metro 'data_ref_carga' n√£o informado (formato esperado: yyyy-MM-dd)")

print(f"üóìÔ∏è Data de refer√™ncia da carga solicitada: {data_ref_carga}")

# Fully qualified (catalog.schema.table) names of the source and target tables.
catalog = "amazonia_catalog"
schema = "b_tbra"
table_bronze = f"{catalog}.{schema}.lim_bio"
table_silver = f"{catalog}.{schema}.lim_bio_silver"

In [0]:
# ================================================================
# 1) Check whether the requested partition exists in Bronze
# ================================================================
# Distinct load dates present in Bronze, newest first.
particoes = (
    spark.sql(f"SELECT DISTINCT data_ref_carga FROM {table_bronze}")
    .orderBy(F.col("data_ref_carga").desc())
    .collect()
)

# Drop NULL partition values: they can never be a valid load date, and a table
# holding only NULLs must be reported as having no usable partition.
particoes_disponiveis = [
    r["data_ref_carga"] for r in particoes if r["data_ref_carga"] is not None
]

if not particoes_disponiveis:
    raise RuntimeError(f"‚ùå Nenhuma parti√ß√£o encontrada na Bronze: {table_bronze}")

print("üìÖ Parti√ß√µes dispon√≠veis:", particoes_disponiveis)

# Match the requested date against the *text form* of each partition value.
# NOTE(review): the Bronze column type is not visible here. If it is DATE,
# collect() returns datetime.date objects, which never compare equal to the
# widget string, so a direct `in` test would always fall back to the latest
# partition. For a STRING column this lookup is equivalent to the direct test.
# The originally collected value is kept so downstream typing is unchanged.
particoes_por_texto = {str(valor): valor for valor in particoes_disponiveis}

if data_ref_carga in particoes_por_texto:
    data_usada = particoes_por_texto[data_ref_carga]
    print(f"‚úÖ Usando parti√ß√£o solicitada: {data_usada}")
else:
    # Requested partition is absent: fall back to the most recent one
    # (the list is ordered newest first).
    data_usada = particoes_disponiveis[0]
    print(f"‚ö†Ô∏è Parti√ß√£o {data_ref_carga} n√£o encontrada, usando a √∫ltima dispon√≠vel: {data_usada}")

In [0]:
# ================================================================
# 2) Read Bronze, keeping only the chosen partition
# ================================================================
# Predicate selecting the rows of the partition picked in the previous step.
filtro_particao = F.col("data_ref_carga") == data_usada
df_bronze = spark.table(table_bronze).where(filtro_particao)

# The count() inside the message triggers a Spark job over the filtered rows.
print(f"‚úÖ Bronze carregado com {df_bronze.count()} registros da parti√ß√£o {data_usada}")

In [0]:
# ================================================================
# 3) Extraction of the relevant fields
# ================================================================
# Output column name -> backtick-quoted path of the nested field in the Bronze
# record (the backticks are needed because the field names contain ':').
cols_to_select = {
    "title": "`gmd:identificationInfo`.`gmd:MD_DataIdentification`.`gmd:citation`.`gmd:CI_Citation`.`gmd:title`.`gco:CharacterString`",
    "abstract": "`gmd:identificationInfo`.`gmd:MD_DataIdentification`.`gmd:abstract`.`gco:CharacterString`",
    "purpose": "`gmd:identificationInfo`.`gmd:MD_DataIdentification`.`gmd:purpose`.`gco:CharacterString`",
    "reference_system": "`gmd:referenceSystemInfo`.`gmd:MD_ReferenceSystem`.`gmd:referenceSystemIdentifier`.`gmd:RS_Identifier`.`gmd:code`.`gco:CharacterString`",
    "date_stamp": "`gmd:dateStamp`.`gco:DateTime`",
}

# One aliased column expression per mapping entry, in dictionary order.
select_exprs = []
for nome_saida, caminho_xml in cols_to_select.items():
    select_exprs.append(F.col(caminho_xml).alias(nome_saida))

# Clean-up expressions applied on top of the flattened columns.
titulo_limpo = F.trim(F.col("title"))                                # strip leading/trailing spaces
resumo_limpo = F.regexp_replace(F.col("abstract"), "\\s+", " ")      # collapse whitespace runs
carimbo_ts = F.to_timestamp(F.col("date_stamp"))                     # parse into a timestamp

df_silver = (
    df_bronze.select(*select_exprs)
    # Tag every row with the partition actually read from Bronze.
    .withColumn("data_ref_carga", F.lit(data_usada))
    .withColumn("title", titulo_limpo)
    .withColumn("abstract", resumo_limpo)
    .withColumn("date_stamp", carimbo_ts)
)

In [0]:
# ================================================================
# 4) Write the Silver table (partitioned)
# ================================================================
# Idempotent load: atomically replace only the partition being written instead
# of blindly appending. With a plain append, re-running the job for the same
# date (or falling back to the same latest Bronze partition on several runs)
# would duplicate every row in Silver. Other partitions are left untouched.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    # Delta selective overwrite: restricts the overwrite to rows matching the
    # predicate; every row written carries data_ref_carga == data_usada.
    .option("replaceWhere", f"data_ref_carga = '{data_usada}'")
    .partitionBy("data_ref_carga")
    .saveAsTable(table_silver)
)

print(f"‚úÖ Silver atualizada: {table_silver} (parti√ß√£o {data_usada})")

In [0]:
# ================================================================
# 5) Quick preview
# ================================================================
# Show at most five rows of the transformed data as a sanity check.
amostra_silver = df_silver.limit(5)
display(amostra_silver)