In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
# import delta
# def table_exists(catalog, database, table):
#     count = (
#         spark.sql(f"SHOW TABLES FROM {catalog}.{database}")
#         .filter(f"database = '{database}' AND tableName = '{table}'")
#         .count()
#     )
#     return count == 1

In [0]:
# Alternative approach for the table_exists function using SHOW TABLES IN
# import delta
# def table_exists(catalog, database, table):
#     df_tables = spark.sql(f"SHOW TABLES IN {catalog}.{database}")
#     # Quoting the values in the filter to avoid a parsing error
#     count = (
#         df_tables
#         .filter(f"database = '{database}' AND tableName = '{table}'")
#         .count()
#     )
#     return count > 0

# Advantage: no need to try to access the table; it runs fast if the catalog is already populated.
# Note: if the database column in the listing comes back empty (this happens depending on the version/configuration), the filter may fail.


In [0]:
import delta
import sys

sys.path.insert(0, '../lib')

import utils
import ingestors
from pyspark.sql.functions import col, to_timestamp

In [0]:

# Target catalog and schema handed to the ingestors in the cells below.
catalog = 'bronze'
schemaname = 'upsell'

# Job parameters come from the notebook widgets; they are read in the order
# tablename -> id_field -> timestamp_field.
tablename, id_field, timestamp_field = (
    dbutils.widgets.get(widget_name)
    for widget_name in ('tablename', 'id_field', 'timestamp_field')
)

# Volume locations derived from the schema and table name:
#   - full_load_path: source passed to the full-load ingestor
#   - cdc_path: source passed to the CDC ingestor
#   - checkpoint_location: removed before a fresh full load
full_load_path = f"/Volumes/raw/{schemaname}/full_load/{tablename}/"
cdc_path = f"/Volumes/raw/{schemaname}/cdc/{tablename}/"
checkpoint_location = f"/Volumes/raw/{schemaname}/cdc/{tablename}_checkpoints"

In [0]:
# Bootstrap step: run the full load only when the target table is missing.
if utils.table_exists(spark, catalog, schemaname, tablename):
    # Table is already there, so nothing is loaded and the checkpoint is kept.
    print(f'Tabela {tablename} já existente em {catalog}.{schemaname}, ignorando full-load')
else:
    print('Tabela não existe, criando...')

    # Recursively remove the checkpoint directory before the table is
    # (re)created. NOTE(review): presumably so the CDC stream reprocesses
    # from scratch against the new table; confirm.
    dbutils.fs.rm(checkpoint_location, True)

    ingest_full_load = ingestors.ingestor(
        spark=spark,
        catalog=catalog,
        schemaname=schemaname,
        tablename=tablename,
        data_format='parquet',
    )
    ingest_full_load.execute(full_load_path)

    print(f'Criando tabela {tablename} em {catalog}.{schemaname}. Tabela criada com sucesso!')

In [0]:
# Build the CDC ingestor for the target table. id_field and timestamp_field
# come from the notebook widgets.
# NOTE(review): presumably they serve as the merge key and the ordering column
# for deduplication; confirm in lib/ingestors.
ingest_cdc = ingestors.ingestorCDC(
    spark=spark,
    catalog=catalog,
    schemaname=schemaname,
    tablename=tablename,
    data_format='parquet',
    id_field=id_field,
    timestamp_field=timestamp_field,
)

# Run the ingestor against the CDC volume path and keep whatever it returns.
# NOTE(review): `stream` looks like a streaming query handle; confirm.
stream = ingest_cdc.execute(cdc_path)

In [0]:
# from pyspark.sql.window import Window
# from pyspark.sql import functions as F
# data_schema = utils.import_schema(tablename)
# bronze = delta.DeltaTable.forName(spark, f'{catalog}.{schemaname}.{tablename}')

# def upsert(df, table_name):
#     df.createOrReplaceTempView("cdc_temp_view")

#     merge_sql = f"""
#     MERGE INTO {tablename} AS b
#     USING (
#         SELECT * FROM cdc_temp_view
#         QUALIFY row_number() OVER (PARTITION BY {id_field} ORDER BY {timestamp_field} DESC) = 1
#     ) AS d
#     ON b.{id_field} = d.{id_field}
#     WHEN MATCHED AND d.op = 'D' THEN DELETE
#     WHEN MATCHED AND d.op = 'U' THEN UPDATE SET *
#     WHEN NOT MATCHED AND (d.op = 'I' OR d.op = 'U') THEN INSERT *
#     """

#     spark.sql(merge_sql)


# df_stream = spark.readStream \
#       .format("cloudFiles") \
#       .option("cloudFiles.format", "parquet") \
#       .schema(data_schema) \
#       .load(f"/Volumes/raw/data/cdc/{tablename}/")

# stream = df_stream.writeStream \
#       .option("checkpointLocation", f"/Volumes/raw/data/cdc/{tablename}_checkpoints") \
#       .foreachBatch(lambda df, BatchId: upsert(df, f'{catalog}.{schemaname}.{tablename}')) \
#       .trigger(availableNow=True)