In [0]:
%pip install kaggle pandas python-dotenv

In [0]:
# Inspection cell: list the entries under the configs volume directory.
display(dbutils.fs.ls("/Volumes/raw/data/configs_volume"))

In [0]:
# Inspection cell: print the raw text of config.json so its contents can be
# checked by eye before the pipeline cell parses it.
with open("/Volumes/raw/data/configs_volume/config.json", "r") as f:
    print(f.read())

In [0]:
import datetime
import json
import os
import pandas as pd

# ==== 1. Kaggle credentials from the Databricks secret scope ====
KAGGLE_USERNAME = dbutils.secrets.get(scope="kaggle_scope", key="KAGGLE_USERNAME")
KAGGLE_KEY = dbutils.secrets.get(scope="kaggle_scope", key="KAGGLE_KEY")

# Environment variables for the Kaggle API.
# These are set here, before the Kaggle client is imported further down.
os.environ["KAGGLE_USERNAME"] = KAGGLE_USERNAME
os.environ["KAGGLE_KEY"] = KAGGLE_KEY

# ==== 2. Paths on S3 / Unity Catalog ====
BASE_PATH = "/Volumes/raw/data"        # schema `data`
ACTUAL_PATH = f"{BASE_PATH}/actual"
LAST_PATH = f"{BASE_PATH}/last"

CDC_BASE_PATH = "/Volumes/raw/cdc"     # schema `cdc` (one volume per table)

# ==== 3. Config JSON (also stored on the volume) ====
# The keys read from CONFIG in this notebook are "dataset_name" and "tables".
CONFIG_PATH = "/Volumes/raw/data/configs_volume/config.json"
with open(CONFIG_PATH, "r") as f:
    CONFIG = json.load(f)

# ==== 4. CDC functions ====
def get_update_lines(df_last, df_actual, pk, date_field):
    """Return the rows of ``df_actual`` whose ``date_field`` is newer than in ``df_last``.

    Args:
        df_last: Previous snapshot (pandas DataFrame).
        df_actual: Current snapshot (pandas DataFrame).
        pk: Name of the key column used to match rows between snapshots.
        date_field: Name of the column compared to detect a change.

    Returns:
        A copy of the matching ``df_actual`` rows with an extra ``op``
        column set to ``"U"``. The inputs are not modified.
    """
    # Pair every previous row with its current counterpart; the previous
    # values get the '_x' suffix and the current values get '_y'.
    joined = df_last.merge(df_actual, how="left", on=[pk], suffixes=('_x', '_y'))

    previous_stamp = joined[f"{date_field}_x"]
    current_stamp = joined[f"{date_field}_y"]
    changed_keys = joined.loc[current_stamp > previous_stamp, pk].tolist()

    # Take the full, unsuffixed rows from the current snapshot.
    updated_rows = df_actual.loc[df_actual[pk].isin(changed_keys)].copy()
    updated_rows["op"] = "U"
    return updated_rows

def get_insert_lines(df_last, df_actual, pk):
    """Return the rows of ``df_actual`` whose ``pk`` value is absent from ``df_last``.

    The result is a copy with an extra ``op`` column set to ``"I"``;
    neither input frame is modified.
    """
    known_keys = df_last[pk]
    is_new = ~df_actual[pk].isin(known_keys)
    inserted_rows = df_actual.loc[is_new].copy()
    inserted_rows["op"] = "I"
    return inserted_rows

def get_delete_lines(df_last, df_actual, pk):
    """Return the rows of ``df_last`` whose ``pk`` value is absent from ``df_actual``.

    The result is a copy with an extra ``op`` column set to ``"D"``;
    neither input frame is modified.
    """
    surviving_keys = df_actual[pk]
    is_gone = ~df_last[pk].isin(surviving_keys)
    deleted_rows = df_last.loc[is_gone].copy()
    deleted_rows["op"] = "D"
    return deleted_rows

def create_cdc(df_last, df_actual, pk, date_field):
    """Build the change set between two snapshots of one table.

    Concatenates, in this order, the updated ("U"), inserted ("I") and
    deleted ("D") rows into a single DataFrame with a fresh 0..n-1 index.
    """
    change_sets = [
        get_update_lines(df_last, df_actual, pk, date_field),
        get_insert_lines(df_last, df_actual, pk),
        get_delete_lines(df_last, df_actual, pk),
    ]
    return pd.concat(change_sets, ignore_index=True)

# ==== 5. CDC processing - one Parquet output per table, history kept ====
def _read_snapshot_csv(path, table):
    """Read one CSV snapshot; return the DataFrame, or None after logging a failure."""
    try:
        # The separator lookup stays inside the try so a malformed table
        # entry is logged and skipped rather than aborting the whole run.
        return pd.read_csv(path, sep=table["sep"], on_bad_lines='skip')
    except Exception as e:
        print(f"[ERRO] Falha ao ler {path}: {e}")
        return None


def process_cdc(tables):
    """Compute and persist the CDC change set for every configured table.

    For each table the previous (``LAST_PATH``) and current (``ACTUAL_PATH``)
    CSV snapshots are read, diffed with ``create_cdc`` and, when there are
    changes, written as Parquet under ``CDC_BASE_PATH/<name>/<timestamp>.parquet``.

    Args:
        tables: Iterable of dicts providing the keys ``name``, ``sep``,
            ``pk`` and ``date_field``.

    Returns:
        None. Tables whose snapshots cannot be read are skipped (best
        effort) and listed in the final log line, so the run no longer
        claims success when some tables were not processed.
    """
    print("Processando CDC de todas as tabelas...")
    failed_tables = []

    for t in tables:
        file_last = f"{LAST_PATH}/{t['name']}.csv"
        file_actual = f"{ACTUAL_PATH}/{t['name']}.csv"

        df_last = _read_snapshot_csv(file_last, t)
        if df_last is None:
            failed_tables.append(str(t['name']))
            continue

        df_actual = _read_snapshot_csv(file_actual, t)
        if df_actual is None:
            failed_tables.append(str(t['name']))
            continue

        df_cdc = create_cdc(df_last, df_actual, t["pk"], t["date_field"])

        if df_cdc.empty:
            print(f"Nenhuma alteração encontrada para {t['name']}.")
            continue

        # Location of this table's CDC output.
        cdc_path = f"{CDC_BASE_PATH}/{t['name']}"
        dbutils.fs.mkdirs(cdc_path)

        # Timestamp used as the output name, so each run adds a new entry
        # instead of replacing the previous ones.
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        parquet_path = f"{cdc_path}/{now}.parquet"

        # Save as Parquet (per the original author's note, Spark is needed
        # to write to UC/S3).
        df_spark = spark.createDataFrame(df_cdc)
        df_spark.write.mode("overwrite").parquet(parquet_path)

        print(f"[LOG] CDC de {t['name']} salvo em {parquet_path}")

    if failed_tables:
        print(
            f"[ERRO] CDC concluído com falhas de leitura em "
            f"{len(failed_tables)} tabela(s): {', '.join(failed_tables)}"
        )
    else:
        print("CDC processado com sucesso!")

# ==== 6. Download the Kaggle data ====
def download_kaggle_dataset(dataset_name):
    """Download a Kaggle dataset and copy its unzipped contents into ``ACTUAL_PATH``.

    Args:
        dataset_name: Dataset identifier passed to
            ``KaggleApi.dataset_download_files``.

    Returns:
        None. Files are downloaded into a temporary local directory, which
        is removed on exit, and copied from there to ``ACTUAL_PATH``.
    """
    # Imported here rather than at the top of the cell.
    # NOTE(review): presumably so the KAGGLE_* environment variables are
    # already set when the Kaggle package is loaded - confirm.
    from kaggle.api.kaggle_api_extended import KaggleApi
    import tempfile

    # Temporary local directory (on the cluster driver, per the original comment).
    with tempfile.TemporaryDirectory() as tmpdir:
        api = KaggleApi()
        api.authenticate()

        print(f"Baixando dataset {dataset_name} para {tmpdir}...")
        api.dataset_download_files(dataset_name, path=tmpdir, unzip=True)
        print("Download concluído!")

        # Make sure the destination exists.
        dbutils.fs.mkdirs(ACTUAL_PATH)

        # Copy everything from the local temp folder to the destination.
        for item in os.listdir(tmpdir):
            local_file = os.path.join(tmpdir, item)
            target_path = f"{ACTUAL_PATH}/{item}"
            # An unzipped dataset may contain sub-directories; those are
            # only copied when recurse is enabled.
            dbutils.fs.cp(f"file:{local_file}", target_path, os.path.isdir(local_file))

        print(f"Arquivos movidos para {ACTUAL_PATH}")


# ==== 7. Move actual -> last ====
def move_from_actual_to_last():
    """Move every entry under ``ACTUAL_PATH`` into ``LAST_PATH``.

    Both locations are created first, so listing ``ACTUAL_PATH`` does not
    fail when nothing has been downloaded yet. Entries are moved with
    recurse enabled and keep their own name under ``LAST_PATH``.
    """
    dbutils.fs.mkdirs(LAST_PATH)
    # ACTUAL_PATH is otherwise only created by download_kaggle_dataset,
    # which main() runs after this function.
    dbutils.fs.mkdirs(ACTUAL_PATH)

    print(f"Movendo arquivos de {ACTUAL_PATH} para {LAST_PATH}...")
    for entry in dbutils.fs.ls(ACTUAL_PATH):
        # Use the entry's own name rather than os.path.basename(entry.path):
        # a path ending in "/" (directory entries) has an empty basename,
        # which would make the target LAST_PATH itself.
        entry_name = entry.name.rstrip("/")
        dbutils.fs.mv(entry.path, f"{LAST_PATH}/{entry_name}", True)
    print("Arquivos movidos com sucesso!")

# ==== 8. Main ====
def main():
    """Run one ingestion cycle.

    Steps, in order: rotate the current snapshot into ``last``, download a
    fresh copy of the configured Kaggle dataset into ``actual``, then
    compute the CDC for every configured table.
    """
    # Read the dataset name before touching any files, so a missing key
    # fails before the snapshot rotation starts.
    kaggle_dataset = CONFIG["dataset_name"]

    move_from_actual_to_last()
    download_kaggle_dataset(kaggle_dataset)

    configured_tables = CONFIG["tables"]
    process_cdc(configured_tables)

# Run the whole pipeline when this cell executes.
main()


In [0]:
# Create the "/tmp" directory through dbutils.fs.
# NOTE(review): nothing visible in this notebook reads from or writes to this
# path - confirm the cell is still needed.
dbutils.fs.mkdirs("/tmp")
