In [0]:
# process_silver_xml (SILVER)
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType

In [0]:
# ================================================================
# 1️⃣ JOB PARAMETERS
# ================================================================
# Declare the widgets read below; only "catalog" has a non-empty default.
dbutils.widgets.text("catalog", "datamasters")
dbutils.widgets.text("data_ref_carga", "")
dbutils.widgets.text("file_prefix", "")
dbutils.widgets.text("schema", "")
dbutils.widgets.text("schema_out", "")
dbutils.widgets.text("table", "")
dbutils.widgets.text("table_out", "")
dbutils.widgets.text("uuid_metadata", "")

catalog        = dbutils.widgets.get("catalog")
data_ref_carga = dbutils.widgets.get("data_ref_carga")
file_prefix    = dbutils.widgets.get("file_prefix")
schema         = dbutils.widgets.get("schema")
schema_out     = dbutils.widgets.get("schema_out")
table          = dbutils.widgets.get("table")
table_out      = dbutils.widgets.get("table_out")
uuid_metadata  = dbutils.widgets.get("uuid_metadata")

# Fail fast when a required parameter is empty or whitespace-only.
# "file_prefix" is required as well: it is interpolated into the Bronze table
# name, so an empty value would produce a name ending in "e_" and only fail
# later when the table is read. "uuid_metadata" remains optional.
required_params = {
    "catalog": catalog,
    "schema": schema,
    "schema_out": schema_out,
    "file_prefix": file_prefix,
    "table": table,
    "table_out": table_out,
    "data_ref_carga": data_ref_carga,
}
missing_params = [name for name, value in required_params.items() if not value.strip()]
if missing_params:
    # Name the offending parameters so the job log points at the actual problem.
    raise ValueError("Parametros obrigatórios ausentes: " + ", ".join(missing_params))

print("✔ Parâmetros carregados:")
print("catalog       :", catalog)
print("schema        :", schema)
print("schema_out    :", schema_out)
print("file_prefix   :", file_prefix)
print("table         :", table)
print("table_out     :", table_out)
print("data_ref_carga:", data_ref_carga)
print("uuid_metadata :", uuid_metadata)

In [0]:
# ================================================================
# 2️⃣ BRONZE AND SILVER TABLE NAMES
# ================================================================
# Both names have the form <catalog>.<schema>.e_<suffix>: the Bronze source
# uses schema + file_prefix, the Silver target uses schema_out + table_out.
bronze_table = ".".join((catalog, schema, f"e_{file_prefix}"))
silver_table = ".".join((catalog, schema_out, f"e_{table_out}"))

print("Tabela Bronze:", bronze_table)
print("Tabela Silver:", silver_table)

In [0]:
# ================================================================
# 3️⃣ READ BRONZE
# ================================================================
df_bronze = spark.table(bronze_table)

# Debug output: print the schema, then the row count (count() runs a Spark job).
df_bronze.printSchema()
bronze_row_count = df_bronze.count()
print("Registros Bronze:", bronze_row_count)

In [0]:
# ================================================================
# 2) Take the first element of the gmd:extent array
#    - index 0 through Column indexing (the same operation as getItem(0))
#    - every name containing ":" is quoted with backticks
# ================================================================
extent_col = F.col("`gmd:identificationInfo`.`gmd:MD_DataIdentification`.`gmd:extent`")
df = df_bronze.withColumn("extent0", extent_col[0])

In [0]:

# ================================================================
# 3) Drill down to EX_Extent and EX_GeographicBoundingBox
# ================================================================
# Each step reads one field of the column created by the previous step:
#   extent0 -> ex_extent -> geo_element -> geobox
# gmd:geographicElement is read directly as a field; the original author notes
# that it is a struct (not an array) in this schema.
df = (
    df
    .withColumn("ex_extent", F.col("extent0.`gmd:EX_Extent`"))
    .withColumn("geo_element", F.col("ex_extent.`gmd:geographicElement`"))
    .withColumn("geobox", F.col("geo_element.`gmd:EX_GeographicBoundingBox`"))
)

In [0]:
# ================================================================
# 4) Extract the bounding-box numeric values (gco:Decimal)
# ================================================================
# One output column per bounding-box edge: the column is named after the
# source field and its gco:Decimal value is cast to double.
for bbox_field in (
    "eastBoundLongitude",
    "westBoundLongitude",
    "northBoundLatitude",
    "southBoundLatitude",
):
    df = df.withColumn(
        bbox_field,
        F.col(f"geobox.`gmd:{bbox_field}`.`gco:Decimal`").cast("double"),
    )

In [0]:
# ================================================================
# 5) Extract other useful fields (title, abstract, publication_date, graphic, license, emails, keywords)
# Paths containing ":" are always backtick-quoted; element_at/getItem are used where needed.
# ================================================================
# Common prefix of every path read in this step.
md_ident_path = "`gmd:identificationInfo`.`gmd:MD_DataIdentification`"

# Output column name -> path relative to MD_DataIdentification.
# Dict insertion order is the order in which the columns are added.
simple_field_paths = {
    "title": "`gmd:citation`.`gmd:CI_Citation`.`gmd:title`.`gco:CharacterString`",
    "abstract": "`gmd:abstract`.`gco:CharacterString`",
    "publication_date": "`gmd:citation`.`gmd:CI_Citation`.`gmd:date`.`gmd:CI_Date`.`gmd:date`.`gco:Date`",
    "graphic_file": "`gmd:graphicOverview`.`gmd:MD_BrowseGraphic`.`gmd:fileName`.`gco:CharacterString`",
    "graphic_description": "`gmd:graphicOverview`.`gmd:MD_BrowseGraphic`.`gmd:fileDescription`.`gco:CharacterString`",
    "license_info": "`gmd:resourceConstraints`.`gmd:MD_LegalConstraints`.`gmd:otherConstraints`.`gco:CharacterString`",
}

for out_name, rel_path in simple_field_paths.items():
    df = df.withColumn(out_name, F.col(f"{md_ident_path}.{rel_path}"))

# Emails: gmd:pointOfContact is iterated with transform(); every element is
# mapped to its nested electronicMailAddress character string.
def _contact_email(contact):
    """Return the e-mail address column nested inside one pointOfContact element."""
    node = contact
    for field_name in (
        "gmd:CI_ResponsibleParty",
        "gmd:contactInfo",
        "gmd:CI_Contact",
        "gmd:address",
        "gmd:CI_Address",
        "gmd:electronicMailAddress",
        "gco:CharacterString",
    ):
        # Column indexing by name extracts the field without needing backticks.
        node = node[field_name]
    return node


df = df.withColumn(
    "emails",
    F.transform(
        F.col("`gmd:identificationInfo`.`gmd:MD_DataIdentification`.`gmd:pointOfContact`"),
        _contact_email,
    ),
)

# Keywords: map every descriptiveKeywords entry to the character strings of its
# keywords, then flatten the nested arrays into a single array
# (presumably array<string>; depends on the Bronze schema).
df = df.withColumn(
    "keywords",
    F.flatten(
        F.transform(
            F.col("`gmd:identificationInfo`.`gmd:MD_DataIdentification`.`gmd:descriptiveKeywords`"),
            lambda keyword_group: F.transform(
                keyword_group["gmd:MD_Keywords"]["gmd:keyword"],
                lambda keyword: keyword["gco:CharacterString"],
            ),
        )
    ),
)


In [0]:
# ================================================================
# 6) Build the final SILVER dataframe with standardized fields
# ================================================================
# Load-control columns come first (job parameters as literals plus the current
# timestamp), followed by file_name and the columns derived in the previous steps.
# file_name is not created in this notebook, so it is presumably a Bronze column
# (TODO confirm).
df_silver = df.select(
    F.lit(uuid_metadata).alias("uuid_metadata"),
    F.lit(data_ref_carga).alias("data_ref_carga"),
    F.current_timestamp().alias("ingestion_ts"),
    "file_name",
    "title",
    "abstract",
    "publication_date",
    "eastBoundLongitude",
    "westBoundLongitude",
    "northBoundLatitude",
    "southBoundLatitude",
    "graphic_file",
    "graphic_description",
    "license_info",
    "emails",
    "keywords"
)

# The former withColumnRenamed("file_name", "file_name") step was removed: it
# renamed the column to its own name and had no effect. Every selected column
# already has a name without ":".

# Debug output.
# NOTE(review): show() prints rows including the "emails" column to the job
# output; confirm this is acceptable for production runs.
df_silver.printSchema()
df_silver.show(truncate=False)

In [0]:
# ================================================================
# 7) Write SILVER (full overwrite), partitioned by data_ref_carga
# ================================================================
target_silver = silver_table

# mode="overwrite" replaces the table contents and overwriteSchema allows the
# stored schema to be replaced as well.
# NOTE(review): since every row carries the same data_ref_carga literal, a full
# overwrite presumably leaves only the current load's partition — confirm this
# is the intended retention.
df_silver.write.saveAsTable(
    target_silver,
    format="delta",
    mode="overwrite",
    partitionBy="data_ref_carga",
    overwriteSchema="true",
)

print("Silver gravado em:", target_silver)
dbutils.notebook.exit("SUCCESS")