In [0]:
# import delta
# def table_exists(catalog, database, table):
#     count = (
#         spark.sql(f"SHOW TABLES FROM {catalog}.{database}")
#         .filter(f"database = '{database}' AND tableName = '{table}'")
#         .count()
#     )
#     return count == 1

In [0]:
# Outra abordagem para a função table_exists utilizando SHOW TABLES IN
# import delta
# def table_exists(catalog, database, table):
#     df_tables = spark.sql(f"SHOW TABLES IN {catalog}.{database}")
#     # Usando aspas no filtro para evitar erro de parsing
#     count = (
#         df_tables
#         .filter(f"database = '{database}' AND tableName = '{table}'")
#         .count()
#     )
#     return count > 0

# Vantagem: não precisa tentar acessar a tabela; funciona rápido se o catálogo já está populado.
# Observação: se o database na listagem não vier preenchido (isso acontece dependendo da versão/configuração), o filtro pode falhar.


In [0]:
# table_exists implemented with the Spark catalog API (spark.catalog.tableExists).

# `delta` is imported here because a later cell uses delta.DeltaTable.
import delta
def table_exists(catalog, database, table):
    """Return True when ``catalog.database.table`` exists as a table or view.

    The check is delegated to ``spark.catalog.tableExists``, which answers with
    a plain boolean. Unlike wrapping a ``DESCRIBE TABLE`` in a blanket
    ``except Exception``, failures unrelated to existence are not converted
    into ``False`` here, so the caller does not take the "create the table"
    path just because the lookup itself failed.

    Args:
        catalog: Catalog name.
        database: Schema (database) name inside the catalog.
        table: Table name inside the schema.

    Returns:
        bool: True if the catalog reports the table/view as existing.

    Note:
        Passing a catalog-qualified (three-part) name to ``tableExists``
        requires Spark 3.4 or newer.
    """
    return spark.catalog.tableExists(f"{catalog}.{database}.{table}")

# Advantage: does not depend on the SHOW TABLES output format or on the
# `database` column being populated in its result.
# NOTE(review): behavior for external tables and views under Unity Catalog is
# expected to match the previous DESCRIBE TABLE approach - confirm on the target runtime.

In [0]:
# Target location of the bronze table: <catalog>.<schema>.<tablename>.
catalog, schema = 'bronze', 'upsell'

# Run parameters are hard-coded for now; the widget-based lookups are kept
# below for reference.
# tablename = dbutils.widgets.get('tablename')
# id_field = dbutils.widgets.get('id_field')
# timestamp_field = dbutils.widgets.get('timestamp_field')
tablename, id_field, timestamp_field = (
    'customers',      # table to load
    'IdCliente',      # key column used to match rows
    'DtAtualizacao',  # column used to pick the most recent change
)

In [0]:
# Seed the bronze table from the full-load parquet files, but only when the
# table is not there yet; an existing table is left untouched.
if table_exists(catalog, schema, tablename):
    print('Tabela já existe, ignorando full-load')
else:
    print('Tabela não existe, criando...')

    df_full = spark.read.format('parquet').load(f'/Volumes/raw/full_load/{tablename}/')

    # coalesce(1) reduces the DataFrame to a single partition before writing.
    single_partition = df_full.coalesce(1)
    delta_writer = single_partition.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(f"{catalog}.{schema}.{tablename}")

In [0]:
# Capturando o schema da tabela
df_full = spark.read.format('parquet').load(f'/Volumes/raw/data/cdc/{tablename}/')
schema = df_full.schema

In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

bronze = delta.DeltaTable.forName(spark, f'{catalog}.{schema}.{tablename}')

def upsert(df,deltatable):      
      # Define a janela (window) para row_number
      window_spec = Window.partitionBy(id_field).orderBy(F.col(timestamp_field).desc())

      # Adiciona o row_number
      df_cdc = df.withColumn("row_num", F.row_number().over(window_spec)) \
                  .filter(F.col("row_num") == 1) \
                  .drop("row_num")

      deltatable.alias('b') \
            .merge(df_cdc.alias('d'), f'b.{id_field} = d.{id_field}') \
            .whenMatchedDelete(condition="d.op = 'D'") \
            .whenMatchedUpdateAll(condition="d.op = 'U'") \
            .whenNotMatchedInsertAll(condition="d.op = 'I' OR d.op = 'U'") \
            .execute()

df_stream = spark.readStream \
      .format("cloudFiles") \
      .option("cloudFiles.format", "parquet") \
      .schema(schema) \
      .load(f"/Volumes/raw/data/cdc/{tablename}/")

stream = df_stream.writeStream \
      .option("checkpointLocation", f"/Volumes/raw/data/cdc/{tablename}_checkpoints") \
      .foreachBatch(lambda df, BatchId: upsert(df,bronze)) \
      .trigger(availableNow=True)

In [0]:
# Opção sugerida pelo Assistant do Databricks:
# You need to specify the cloudFiles.schemaLocation option to enable schema inference and evolution when using Auto Loader. This option points to a directory where Databricks will store schema information for your stream. Here is the fixed code:

# df_stream = (
#     spark.readStream
#     .format("cloudFiles")
#     .option("cloudFiles.format", "parquet")
#     .option("cloudFiles.schemaLocation", f"/Volumes/raw/data/cdc/{tablename}/_schemas")
#     .load(f"/Volumes/raw/data/cdc/{tablename}/")
# )

# This code adds the required schemaLocation option. The directory specified will track schema changes over time for your streaming source.

In [0]:
# Start the streaming write defined above; `start` holds the object returned by stream.start().
# NOTE(review): nothing in this cell waits for the query to finish - confirm whether a wait is needed when run as a job.
start = stream.start()