## Bronze Layer

#### Criei um volume Delta, já que no free tier o acesso a `/mnt/` é negado

#### Verificando os dados de teste disponíveis

In [0]:
%py
# Base storage locations for the retail dataset.
# A /Volumes/ path is used instead of the /mnt/ location shown in the
# commented-out line below, which is kept only as a reference.
# BRONZE_PATH = "/mnt/delta/bronze"
BRONZE_PATH = "/Volumes/workspace/default/delta/bronze/retail"
SILVER_PATH = "/Volumes/workspace/default/delta/silver/retail"


# NOTE(review): an earlier comment here announced the creation of a Silver
# directory, but this cell contains no code that creates one, and SILVER_PATH is
# not referenced by any other cell visible in this notebook section.


In [0]:
# Show what is stored directly under the bronze location.
bronze_entries = dbutils.fs.ls(BRONZE_PATH)
display(bronze_entries)


### Dataset retail silver


In [0]:
# Bronze Delta tables to expose as temporary views. Each table is stored in a
# folder of the same name under BRONZE_PATH and is registered as "tmp_<table>".
# Keeping a single list guarantees that a view name and its folder cannot drift
# apart, which was possible with hand-written key/path pairs.
bronze_tables = [
    "customers",
    "company_employees",
    "active_promotions",
    "loyalty_segment",
    "products",
    "promotions",
    "purchase_orders",
    "sales_orders",
    "sales_stream",
    "suppliers",
]

# Maps temp-view name -> Delta folder. Note that, despite the variable name, every
# path points at the bronze inputs (BRONZE_PATH), not at a silver location.
silver_map = {f"tmp_{table}": f"{BRONZE_PATH}/{table}" for table in bronze_tables}

# Register (or replace) one temp view per bronze table so the SQL cells can query them.
for view_name, path in silver_map.items():
    print(f"Loading {view_name} from {path}")
    spark.read.format("delta").load(path).createOrReplaceTempView(view_name)
# DBTITLE 1,customers

In [0]:
%sql
-- Preview every row and column of the loyalty segment temp view.
SELECT
  *
FROM
  tmp_loyalty_segment;

### Lendo as tabelas

In [0]:
%sql
-- Attach the loyalty segment row to each customer. The LEFT JOIN keeps customers
-- whose loyalty_segment has no matching loyalty_segment_id.
SELECT
  *
FROM
  tmp_customers AS tc
  LEFT JOIN tmp_loyalty_segment AS tls
    ON tc.loyalty_segment = tls.loyalty_segment_id;

In [0]:
# Scratch reference: a Nominatim (OpenStreetMap) reverse-lookup URL for one sample
# lat/lon pair, asking for a JSON response. The bare string is only echoed as the
# cell result; no cell visible in this notebook section sends the request.
'https://nominatim.openstreetmap.org/reverse?lat=-23.55052&lon=-46.633308&format=json'

In [0]:
from pyspark.sql import functions as F

# Placeholder strings (compared in lower case) that stand for "no value" in the
# parts parsed out of ship_to_address.
_MISSING_TOKENS = ("nan", "null", "none", "0")


def _ship_or_original(ship_col: str, original_col: str):
    """Build a column expression that prefers ``ship_col`` and falls back to ``original_col``.

    Args:
        ship_col: Name of the column parsed out of ``ship_to_address``.
        original_col: Name of the customer column used when the parsed value is unusable.

    Returns:
        A Column equal to ``original_col`` when ``ship_col`` is NULL, blank after
        trimming, or (case-insensitively) one of ``_MISSING_TOKENS``; otherwise
        equal to ``ship_col``.
    """
    ship = F.col(ship_col)
    is_missing = (
        ship.isNull()
        | (F.trim(ship) == "")
        | F.lower(ship).isin(*_MISSING_TOKENS)
    )
    return F.when(is_missing, F.col(original_col)).otherwise(ship)


# Build the input explicitly instead of reading `_sqldf`. `_sqldf` holds the result
# of whichever %sql cell happened to run last, so relying on it makes this cell
# depend on execution order. The query is the customers/loyalty-segment join over
# the temp views registered earlier in the notebook.
customers_with_segment = spark.sql(
    """
    SELECT *
    FROM tmp_customers AS tc
    LEFT JOIN tmp_loyalty_segment AS tls
      ON tc.loyalty_segment = tls.loyalty_segment_id
    """
)

df_silver = (
    customers_with_segment
    # Strip float-style ".0" suffixes and surrounding whitespace. The trailing \b
    # keeps the pattern from eating the ".0" inside a decimal such as "10.05".
    .withColumn(
        "ship_to_address_clean",
        F.trim(F.regexp_replace("ship_to_address", r"\.0\b", "")),
    )
    # Split on commas, swallowing any whitespace that follows each comma.
    .withColumn("address_parts", F.split(F.col("ship_to_address_clean"), ",\\s*"))
    # The parts are assigned positionally to state, postcode, street and number.
    # NOTE(review): presumably an index past the end of the array yields NULL only
    # when ANSI mode is off; confirm the compute setting if short addresses exist.
    .withColumn("state_ship", F.col("address_parts")[0])
    .withColumn("postcode_ship", F.col("address_parts")[1])
    .withColumn("street_ship", F.col("address_parts")[2])
    .withColumn("number_ship", F.col("address_parts")[3])
    # Postcode keeps digits only; number keeps letters, digits, hyphens and spaces.
    .withColumn("postcode_ship", F.regexp_replace("postcode_ship", r"[^0-9]", ""))
    .withColumn(
        "number_ship",
        F.trim(F.regexp_replace("number_ship", r"[^A-Za-z0-9\- ]", "")),
    )
    # Fall back to the customer's own address fields when a parsed part is missing
    # or is a placeholder token.
    .withColumn("state_ship", _ship_or_original("state_ship", "state"))
    .withColumn("postcode_ship", _ship_or_original("postcode_ship", "postcode"))
    .withColumn("street_ship", _ship_or_original("street_ship", "street"))
    .withColumn("number_ship", _ship_or_original("number_ship", "number"))
    # Drop the helper columns plus the source columns that are not carried to silver.
    .drop("address_parts", "ship_to_address_clean", "tax_id", "tax_code", "state", "city", "postcode", "number", "unit", "region", "district", "valid_from", "valid_to", "street", "loyalty_segment_id", "ship_to_address")
)


In [0]:
# List the column names of the transformed silver DataFrame.
silver_columns = df_silver.columns
display(silver_columns)

In [0]:
# Persist the silver customers as a Delta table, replacing any previous contents
# (switch the mode to "append" to add rows instead).
silver_writer = df_silver.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("workspace.retail.silver_customers")


In [0]:
%sql
-- Read back the persisted silver customers table.
SELECT
  *
FROM
  workspace.retail.silver_customers