In [0]:
# import delta
# def table_exists(catalog, database, table):
#     count = (
#         spark.sql(f"SHOW TABLES FROM {catalog}.{database}")
#         .filter(f"database = '{database}' AND tableName = '{table}'")
#         .count()
#     )
#     return count == 1

In [0]:
# Outra abordagem para a função table_exists utilizando SHOW TABLES IN
# import delta
# def table_exists(catalog, database, table):
#     df_tables = spark.sql(f"SHOW TABLES IN {catalog}.{database}")
#     # Usando aspas no filtro para evitar erro de parsing
#     count = (
#         df_tables
#         .filter(f"database = '{database}' AND tableName = '{table}'")
#         .count()
#     )
#     return count > 0

# Vantagem: não precisa tentar acessar a tabela; funciona rápido se o catálogo já está populado.
# Observação: se o database na listagem não vier preenchido (isso acontece dependendo da versão/configuração), o filtro pode falhar.


In [0]:
import delta
import sys

sys.path.insert(0, '../lib/')

import utils

In [0]:
# Unity Catalog location of the target table.
catalog, schema = 'bronze', 'upsell'

# Table-specific settings are hardcoded here; the widget-driven variant is kept
# below for parameterized runs.
# tablename = dbutils.widgets.get('tablename')
# id_field = dbutils.widgets.get('id_field')
# timestamp_field = dbutils.widgets.get('timestamp_field')
tablename = 'transactions'
id_field, timestamp_field = 'IdTransacao', 'DtCriacao'

# Schema later applied to the CDC stream reader via .schema(df_schema).
df_schema = utils.import_schema(tablename)

In [0]:
# One-time full load: only runs when the target table is not there yet.
if utils.table_exists(spark, catalog, schema, tablename):
    print('Tabela já existe, ignorando full-load')
else:
    print('Tabela não existe, criando...')

    # Snapshot of the source table exported as parquet files.
    df_full = (
        spark.read
        .format('parquet')
        .load(f'/Volumes/raw/full_load/{tablename}/')
    )

    # Collapse to a single partition and materialize it as a Delta table.
    df_full.coalesce(1).write.saveAsTable(
        f"{catalog}.{schema}.{tablename}",
        format="delta",
        mode="overwrite",
    )

In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Handle to the target Delta table, looked up by its three-part name.
# NOTE(review): `bronze` is not referenced by any other visible cell — confirm
# whether it is still needed.
bronze = delta.DeltaTable.forName(spark, f'{catalog}.{schema}.{tablename}')

def upsert(df, table_name):
    """Merge one micro-batch of CDC rows into the Delta table ``table_name``.

    The batch is exposed as the temp view ``cdc_temp_view`` and deduplicated so
    that, for each ``id_field`` value, only the row with the greatest
    ``timestamp_field`` is kept. The surviving rows are then applied by ``op``:

    * ``'D'`` - delete the matching target row;
    * ``'U'`` - update the matching target row, or insert it when absent;
    * ``'I'`` - insert when absent (a matching target row is left untouched).

    ``id_field`` and ``timestamp_field`` are read from the notebook's
    module-level variables.

    Args:
        df: Micro-batch DataFrame handed over by ``foreachBatch``.
        table_name: Fully qualified name of the Delta table to merge into.
    """
    df.createOrReplaceTempView("cdc_temp_view")

    merge_sql = f"""
    MERGE INTO {table_name} AS b
    USING (
        SELECT * FROM cdc_temp_view
        QUALIFY row_number() OVER (PARTITION BY {id_field} ORDER BY {timestamp_field} DESC) = 1
    ) AS d
    ON b.{id_field} = d.{id_field}
    WHEN MATCHED AND d.op = 'D' THEN DELETE
    WHEN MATCHED AND d.op = 'U' THEN UPDATE SET *
    WHEN NOT MATCHED AND (d.op = 'I' OR d.op = 'U') THEN INSERT *
    """

    # The temp view above lives in the session that owns the micro-batch
    # DataFrame. Inside foreachBatch that session is not guaranteed to be the
    # notebook-level `spark`, so run the MERGE on the batch's own session to
    # make sure `cdc_temp_view` resolves.
    df.sparkSession.sql(merge_sql)


# Incremental source: parquet CDC files read through the "cloudFiles" source
# with the explicit schema loaded in the config cell.
df_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .schema(df_schema)
    .load(f"/Volumes/raw/data/cdc/{tablename}/")
)

# Writer definition only: every micro-batch is routed to upsert(); the query
# itself is launched later by calling stream.start().
stream = (
    df_stream.writeStream
    .option("checkpointLocation", f"/Volumes/raw/data/cdc/{tablename}_checkpoints")
    .foreachBatch(lambda batch_df, batch_id: upsert(batch_df, f'{catalog}.{schema}.{tablename}'))
    .trigger(availableNow=True)
)

In [0]:
# Opção sugerida pelo Assistant do Databricks:
# You need to specify the cloudFiles.schemaLocation option to enable schema inference and evolution when using Auto Loader. This option points to a directory where Databricks will store schema information for your stream. Here is the fixed code:

# df_stream = (
#     spark.readStream
#     .format("cloudFiles")
#     .option("cloudFiles.format", "parquet")
#     .option("cloudFiles.schemaLocation", f"/Volumes/raw/data/cdc/{tablename}/_schemas")
#     .load(f"/Volumes/raw/data/cdc/{tablename}/")
# )

# This code adds the required schemaLocation option. The directory specified will track schema changes over time for your streaming source.

In [0]:
# `stream` is the configured stream writer built in the earlier cell; start()
# launches the query and returns its handle.
start = stream.start()

In [0]:
# The writer uses trigger(availableNow=True), so the query ends by itself once
# the files available at start have been processed. Block until then: calling
# stop() right after start() (e.g. on "Run All" or in a job) would abort the
# ingestion before the backlog is merged. Failures of the query surface here.
start.awaitTermination()