In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
from pyspark.sql.functions import to_date, date_format, col
from pyspark.sql.types import IntegerType, FloatType, TimestampType

# Load the raw shipping table and cast its columns to their target types.
# Each entry is (output column, source column, target type); the entries are
# applied in order, one withColumn call per entry.
# NOTE(review): the three accented source columns stay in the frame next to
# their unaccented copies - confirm whether they should be dropped afterwards.
cast_specs = [
    ("id_envio", "id_envio", IntegerType()),
    ("ligacoes_do_cliente", "ligações_do_cliente", IntegerType()),
    ("avaliacao_do_cliente", "avaliação_do_cliente", IntegerType()),
    ("preco", "preço", FloatType()),
    ("qtd_itens", "qtd_itens", IntegerType()),
    ("desconto", "desconto", FloatType()),
    ("peso_g", "peso_g", FloatType()),
    ("avaliacaoEntrega", "avaliacaoEntrega", IntegerType()),
    ("dtIngest", "dtIngest", TimestampType()),
]

df = spark.table("grao.bronze.grain_shipping")
for target_name, source_name, spark_type in cast_specs:
    df = df.withColumn(target_name, col(source_name).cast(spark_type))

In [0]:
from pyspark.sql.functions import sequence, explode, date_format, col, year, month, dayofmonth, expr

# Date range covered by the calendar table.
data_inicio = "2020-01-01"
data_fim = "2030-12-31"

# A single-row frame yields one array holding every date in the range;
# exploding that array gives one row per date in the "data" column.
intervalo_datas = sequence(
    expr(f"to_date('{data_inicio}')"),
    expr(f"to_date('{data_fim}')"),
)
dias = (
    spark
    .range(1)
    .select(intervalo_datas.alias("datas"))
    .select(explode(col("datas")).alias("data"))
)

# Derive the formatted and numeric date parts in a single projection.
# NOTE(review): the weekday/month names in data_completa are whatever
# date_format renders for this pattern - confirm they come out in the
# language expected by downstream consumers.
coluna_data = col("data")
df_calendario = dias.select(
    coluna_data,
    date_format(coluna_data, "EEEE, d 'de' MMMM 'de' yyyy").alias("data_completa"),
    year(coluna_data).alias("ano"),
    month(coluna_data).alias("mes"),
    dayofmonth(coluna_data).alias("dia"),
    date_format(coluna_data, "dd-MM-yyyy").alias("data_ddMMyyyy"),
)

# Persist as the Delta table bronze.calendario, replacing any previous contents.
df_calendario.write.mode("overwrite").format("delta").saveAsTable("bronze.calendario")

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType, TimestampType

# Converts the Portuguese long-form text in "dataEnvio"
# (e.g. "<weekday>, 5 de março de 2021") into a real date column, then casts
# the remaining columns.
# NOTE: this cell overwrites "dataEnvio" in `df`; once the column holds dates
# the text patterns below no longer match, so reload `df` before re-running.

# Portuguese month name -> two-digit month number.
meses_map = {
    "janeiro": "01", "fevereiro": "02", "março": "03", "abril": "04",
    "maio": "05", "junho": "06", "julho": "07", "agosto": "08",
    "setembro": "09", "outubro": "10", "novembro": "11", "dezembro": "12"
}

# 1. Strip the leading weekday and the comma that follows it.
df = df.withColumn("data_limpa", F.regexp_replace("dataEnvio", "^[^,]+,\\s*", ""))

# 2. Day: the one- or two-digit number right before " de".
df = df.withColumn("dia", F.regexp_extract("data_limpa", r"(\d{1,2}) de", 1))

# 3. Month name: the word between "de " and " de".
# NOTE(review): only lowercase month names match - confirm the source casing.
df = df.withColumn("mes_texto", F.regexp_extract("data_limpa", r"de ([a-zç]+) de", 1))

# 4. Year: the four digits at the end of the string.
df = df.withColumn("ano", F.regexp_extract("data_limpa", r"(\d{4})$", 1))

# 5. Month number: the first matching name wins; unknown names yield NULL.
df = df.withColumn(
    "mes_num",
    F.coalesce(*[
        F.when(F.col("mes_texto") == nome_mes, F.lit(numero_mes))
        for nome_mes, numero_mes in meses_map.items()
    ]),
)

# 6. Assemble yyyy-MM-dd and parse it. The day is left-padded to two digits
#    because step 2 may extract a single digit (e.g. "5"), which the
#    two-letter "dd" pattern does not accept when parsing.
df = df.withColumn(
    "dataEnvio",
    F.to_date(
        F.concat_ws("-", F.col("ano"), F.col("mes_num"), F.lpad(F.col("dia"), 2, "0")),
        "yyyy-MM-dd",
    ),
)

# Cast the remaining columns to their target types.
df = (
    df
    .withColumn("id_envio", F.col("id_envio").cast(IntegerType()))
    .withColumn("ligacoes_do_cliente", F.col("ligações_do_cliente").cast(IntegerType()))
    .withColumn("avaliacao_do_cliente", F.col("avaliação_do_cliente").cast(IntegerType()))
    .withColumn("preco", F.col("preço").cast(FloatType()))
    .withColumn("qtd_itens", F.col("qtd_itens").cast(IntegerType()))
    .withColumn("desconto", F.col("desconto").cast(FloatType()))
    .withColumn("peso_g", F.col("peso_g").cast(FloatType()))
    .withColumn("avaliacaoEntrega", F.col("avaliacaoEntrega").cast(IntegerType()))
    .withColumn("dtIngest", F.col("dtIngest").cast(TimestampType()))
)

# Drop the helper columns used only for date parsing.
df = df.drop("data_limpa", "dia", "mes_texto", "ano", "mes_num")
df.display()

In [0]:
# Portuguese month name -> two-digit month number, used by parse_data_pt.
MESES_PT = {
    "janeiro": "01", "fevereiro": "02", "março": "03", "abril": "04",
    "maio": "05", "junho": "06", "julho": "07", "agosto": "08",
    "setembro": "09", "outubro": "10", "novembro": "11", "dezembro": "12",
}


def parse_data_pt(coluna):
    """Build a date Column from a Portuguese long-form date string column.

    The input text is expected to look like "<weekday>, 5 de março de 2021".
    The previous to_date(date_format(...)) combination formatted the column
    instead of parsing it, so it could not produce a date from such text.

    Args:
        coluna: name of the string column to parse.

    Returns:
        A Column of dates; NULL where the text does not yield a valid
        yyyy-MM-dd value (for instance an unknown month name).
    """
    # Strip the leading weekday and the comma that follows it.
    texto = F.regexp_replace(F.col(coluna), "^[^,]+,\\s*", "")
    # The day may be a single digit; pad it so it matches the "dd" pattern.
    dia = F.lpad(F.regexp_extract(texto, r"(\d{1,2}) de", 1), 2, "0")
    mes_texto = F.regexp_extract(texto, r"de ([a-zç]+) de", 1)
    ano = F.regexp_extract(texto, r"(\d{4})$", 1)
    # First matching month name wins; no match leaves the month NULL.
    mes = F.coalesce(*[
        F.when(mes_texto == nome_mes, F.lit(numero_mes))
        for nome_mes, numero_mes in MESES_PT.items()
    ])
    return F.to_date(F.concat_ws("-", ano, mes, dia), "yyyy-MM-dd")


# Reload the raw table, cast the columns and parse both date columns.
# NOTE(review): dataEntrega is assumed to use the same text layout as
# dataEnvio (the original code applied one pattern to both) - confirm.
df = spark.table("grao.bronze.grain_shipping")
df = (
    df
    .withColumn("id_envio", F.col("id_envio").cast(IntegerType()))
    .withColumn("ligacoes_do_cliente", F.col("ligações_do_cliente").cast(IntegerType()))
    .withColumn("avaliacao_do_cliente", F.col("avaliação_do_cliente").cast(IntegerType()))
    .withColumn("preco", F.col("preço").cast(FloatType()))
    .withColumn("qtd_itens", F.col("qtd_itens").cast(IntegerType()))
    .withColumn("desconto", F.col("desconto").cast(FloatType()))
    .withColumn("peso_g", F.col("peso_g").cast(FloatType()))
    .withColumn("dataEnvio", parse_data_pt("dataEnvio"))
    .withColumn("dataEntrega", parse_data_pt("dataEntrega"))
    .withColumn("avaliacaoEntrega", F.col("avaliacaoEntrega").cast(IntegerType()))
    .withColumn("dtIngest", F.col("dtIngest").cast(TimestampType()))
)
display(df)