In [0]:
# process_silver_xml (SILVER)
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType

In [0]:
# ================================================================
# Parameters received from the Databricks Job
# ================================================================
# ---------- widgets
# Every widget is declared with an empty default, so a parameter the job did
# not supply comes back as "" and is rejected by the validation below.
for widget_name in ("catalog", "schema", "schema_out", "table", "table_out", "data_ref_carga"):
    dbutils.widgets.text(widget_name, "")

catalog         = dbutils.widgets.get("catalog").strip()
schema          = dbutils.widgets.get("schema").strip()
schema_out      = dbutils.widgets.get("schema_out").strip()
table_param     = dbutils.widgets.get("table").strip()        # arrives as e.g. "prodes_brasil"
table_out       = dbutils.widgets.get("table_out").strip()
data_ref_carga  = dbutils.widgets.get("data_ref_carga").strip()

# Validate the raw widget value (table_param), not the derived bronze name:
# the "e_" prefix makes `table` non-empty even when the parameter is blank,
# which would let an empty "table" slip through and read "<catalog>.<schema>.e_".
if not (catalog and schema and schema_out and table_param and table_out and data_ref_carga):
    raise ValueError("Parametros obrigatórios: catalog, schema, schema_out, table, table_out, data_ref_carga")

table           = f"e_{table_param}"                           # becomes e.g. "e_prodes_brasil"

In [0]:
# Read the bronze table.
# The fully qualified name is <catalog>.<schema>.<table>.
table_fullname = ".".join([catalog, schema, table])
print("Lendo Bronze:", table_fullname)

df_bronze = spark.read.table(table_fullname)

In [0]:
# ============================================================
# 2️⃣ Helper: extract the first non-null bounding box
# ============================================================
def get_geographic_bbox(field):
    """Build a Column holding the first non-null geographic bounding box of `field`.

    Args:
        field: Spark SQL expression (as a string) that evaluates to an array
            whose elements expose
            ``EX_Extent.geographicElement.EX_GeographicBoundingBox``.
            The text is interpolated verbatim into the SQL expression, so
            pass only trusted column paths.

    Returns:
        A Column that evaluates to the first non-null bounding box found in
        the array, or NULL when the array is NULL, empty, or has no bounding
        box at all.
    """
    # try_element_at is 1-based and yields NULL for an out-of-range index.
    # The `[0]` subscript gives NULL only with ANSI mode off; with ANSI mode
    # on it raises when the filtered array is empty.
    return F.expr(f"""
        try_element_at(
            filter(
                transform(
                    {field},
                    x -> x.EX_Extent.geographicElement.EX_GeographicBoundingBox
                ),
                y -> y is not null
            ),
            1
        )
    """)

In [0]:
# ============================================================
# 3️⃣ Add the bbox column ahead of the final select
# ============================================================
# Path of the extent array handed to the bounding-box helper.
extent_path = "identificationInfo.MD_DataIdentification.extent"
df_temp = df_bronze.withColumn("bbox", get_geographic_bbox(extent_path))

In [0]:
# Common prefix of every field read from the identification section.
MD_IDENT = "identificationInfo.MD_DataIdentification"

# Flatten the nested metadata record into the silver columns.
df_silver = df_temp.select(
    "data_ref_carga",
    F.col(f"{MD_IDENT}.abstract.CharacterString").alias("abstract"),
    F.col(f"{MD_IDENT}.purpose.CharacterString").alias("purpose"),
    F.col(f"{MD_IDENT}.citation.CI_Citation.title.CharacterString").alias("title"),
    F.col(f"{MD_IDENT}.citation.CI_Citation.date.CI_Date.date.Date").alias("publication_date"),
    # Bounding-box limits come from the "bbox" struct column of df_temp.
    F.col("bbox.eastBoundLongitude.Decimal").alias("eastBoundLongitude"),
    F.col("bbox.westBoundLongitude.Decimal").alias("westBoundLongitude"),
    F.col("bbox.northBoundLatitude.Decimal").alias("northBoundLatitude"),
    F.col("bbox.southBoundLatitude.Decimal").alias("southBoundLatitude"),
    F.col(f"{MD_IDENT}.graphicOverview.MD_BrowseGraphic.fileName.CharacterString").alias("graphic_file"),
    F.col(f"{MD_IDENT}.graphicOverview.MD_BrowseGraphic.fileDescription.CharacterString").alias("graphic_description"),
    F.col(f"{MD_IDENT}.resourceConstraints.MD_LegalConstraints.otherConstraints.CharacterString").alias("license_info"),
    # One e-mail address per point of contact.
    F.expr(f"""
        transform(
            {MD_IDENT}.pointOfContact,
            x -> x.CI_ResponsibleParty.contactInfo.CI_Contact.address.CI_Address.electronicMailAddress.CharacterString
        )
    """).alias("emails"),
    # All keywords of all keyword groups in a single array. Groups whose
    # keyword list is NULL are dropped before flattening, because flatten
    # returns NULL for the whole result when any inner array is NULL.
    F.expr(f"""
        flatten(
            filter(
                transform(
                    {MD_IDENT}.descriptiveKeywords,
                    x -> transform(x.MD_Keywords.keyword, y -> y.CharacterString)
                ),
                ks -> ks is not null
            )
        )
    """).alias("keywords"),
)

In [0]:
# Write the Silver Delta table, partitioned by data_ref_carga.
# Only the partition of the current load is replaced (replaceWhere).
target_silver = f"{catalog}.{schema_out}.{table_out}"

# A single predicate drives both the row filter and replaceWhere, so the rows
# written always fall inside the range being replaced. Without the filter,
# rows from other loads would be sent to the writer, which either fails
# Delta's replaceWhere constraint check or writes data outside the intended
# partition.
partition_predicate = f"data_ref_carga = '{data_ref_carga}'"

(
    df_silver.where(partition_predicate)
    .write.format("delta")
    .mode("overwrite")
    # NOTE(review): Delta may ignore overwriteSchema when replaceWhere is set;
    # confirm whether schema changes are expected to propagate here.
    .option("overwriteSchema", "true")
    .option("replaceWhere", partition_predicate)
    .partitionBy("data_ref_carga")
    .saveAsTable(target_silver)
)

print("Silver gravado em:", target_silver)
dbutils.notebook.exit("SUCCESS")