In [None]:
# ==========================================
# 1. Importações
# ==========================================

import pandas as pd
from pyiceberg.types import (
    StringType,
    LongType,
    DoubleType,
    TimestampType,
    NestedField,
)

from config_setup import configure_iceberg
from s3_iceberg_connections import setup_connections
from data_processor import read_raw_csv_from_s3, apply_data_cleaning_and_typing
from iceberg_table_manager import (
    create_iceberg_schema,
    manage_table_lifecycle,
)
from iceberg_writer import write_dataframe_to_iceberg

In [None]:
# ==========================================
# 2. Variáveis do Job
# ==========================================

# Bucket names: the pipeline reads the raw CSV from one bucket and hands the
# other to the Iceberg table setup.
RAW_BUCKET = "raw"          # Reads from the RAW layer
ICEBERG_BUCKET = "pyiceberg"  # Writes to the Iceberg bucket

# Passed to read_raw_csv_from_s3 together with RAW_BUCKET.
# NOTE(review): named "prefix" but the value looks like a single object key —
# confirm how the reader interprets it.
CSV_PREFIX = "dataset.csv"
# Database and table names passed to manage_table_lifecycle.
DB_NAME = "sales"
TABLE_NAME = "pedidos"

In [None]:
# ==========================================
# 3. Função de limpeza
# ==========================================

def clean_and_cast_pedido_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and type-cast the raw orders (pedidos) data.

    Steps, in order:
      1. Rename the ``"Product ID"`` column to ``"Product_ID"``.
      2. Parse ``Total_Vendas``, ``Desconto`` and ``Lucro`` as floats,
         accepting a comma as the decimal separator.
      3. Parse ``Quantidade`` the same way, then cast it to ``int64``.
      4. Parse ``Data_Pedido`` using the ``%d-%m-%Y`` format and store it with
         microsecond resolution.

    Args:
        df: Raw orders frame containing the columns listed above.

    Returns:
        A new DataFrame with the renamed column and converted dtypes.

    Raises:
        KeyError: If one of the converted columns is missing.
        ValueError: If a value cannot be parsed as a number or as a date.
    """

    def _decimal_comma_to_float(values: pd.Series) -> pd.Series:
        # Go through str so already-numeric columns take the same path as
        # text columns, then swap the decimal comma for a dot before parsing.
        as_text = values.astype(str)
        return as_text.str.replace(",", ".", regex=False).astype(float)

    cleaned = df.rename(columns={"Product ID": "Product_ID"})

    for name in ("Total_Vendas", "Desconto", "Lucro"):
        cleaned[name] = _decimal_comma_to_float(cleaned[name])

    # Quantities go through float first so values such as "3,0" still parse.
    quantity = _decimal_comma_to_float(cleaned["Quantidade"])
    cleaned["Quantidade"] = quantity.astype("int64")

    order_date = pd.to_datetime(cleaned["Data_Pedido"], format="%d-%m-%Y")
    cleaned["Data_Pedido"] = order_date.astype("datetime64[us]")

    return cleaned

In [None]:
# ==========================================
# 4. Schema Iceberg
# ==========================================

# Iceberg field definitions for the orders table. Each row of the table below
# is (field id, column name, Iceberg type class); every field is optional
# (required=False). Field ids are spelled out explicitly rather than derived
# from position, so reordering rows never renumbers a column.
pedido_fields = [
    NestedField(field_id, column_name, iceberg_type(), required=False)
    for field_id, column_name, iceberg_type in (
        (1, "ID_Pedido", StringType),
        (2, "Data_Pedido", TimestampType),
        (3, "ID_Cliente", StringType),
        (4, "Segmento", StringType),
        (5, "Regiao", StringType),
        (6, "Pais", StringType),
        (7, "Product_ID", StringType),
        (8, "Categoria", StringType),
        (9, "SubCategoria", StringType),
        (10, "Total_Vendas", DoubleType),
        (11, "Quantidade", LongType),
        (12, "Desconto", DoubleType),
        (13, "Lucro", DoubleType),
        (14, "Prioridade", StringType),
    )
]

In [None]:
# ==========================================
# 5. Orquestração do Pipeline
# ==========================================

# A. Setup and connections
# configure_iceberg() provides the catalog properties; setup_connections()
# turns them into the S3 client and the catalog object used by later cells.
catalog_properties = configure_iceberg()
s3_client, catalog = setup_connections(catalog_properties)
print("Conexões e ambiente configurados.")

# B. Read from RAW + processing
# The CSV is read from RAW_BUCKET / CSV_PREFIX, then handed to the shared
# cleaning entry point together with the orders-specific cleaning function
# (presumably applied to df_raw by that helper — confirm in data_processor).
df_raw = read_raw_csv_from_s3(s3_client, RAW_BUCKET, CSV_PREFIX)
df_final = apply_data_cleaning_and_typing(df_raw, clean_and_cast_pedido_data)
print(f"DataFrame processado. Linhas: {len(df_final)}")

In [None]:
# C. Create / load the Iceberg table
# Build the schema object from the field list declared in the schema cell.
iceberg_schema = create_iceberg_schema(pedido_fields)

# NOTE(review): presumably creates the table when it does not exist and loads
# it otherwise — confirm in iceberg_table_manager. The returned handle is kept
# in `table`.
table = manage_table_lifecycle(
    catalog=catalog,
    db_name=DB_NAME,
    table_name=TABLE_NAME,
    bucket=ICEBERG_BUCKET,
    schema=iceberg_schema,
)