In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Load the bronze shipping table; the following cells keep refining `df`.
df = spark.table("grao.bronze.grain_shipping")

# One row per id_envio that occurs more than once, with its occurrence count.
duplicados = df.groupBy("id_envio").count().filter(F.col("count") > 1)

# Flag every row: valid = 1 when its id_envio occurs exactly once, else 0.
# F.count("id_envio") ignores nulls, so rows with a null id_envio get a
# windowed count of 0 and are therefore flagged 0 as well.
window_spec = Window.partitionBy("id_envio")
df = df.withColumn(
    "valid",
    F.when(F.count("id_envio").over(window_spec) == 1, F.lit(1)).otherwise(F.lit(0)),
)

# count() is a Spark action: evaluate it once and reuse the number instead of
# launching a second job just to build the message.
qtd_duplicados = duplicados.count()
if qtd_duplicados > 0:
    duplicados.display()
    print(f"Existem id_envio duplicados no dataframe {qtd_duplicados}")
else:
    print("Não existem id_duplicados")

In [0]:
# Keep only the rows whose `valid` flag equals 1.
df = df.where(F.col("valid") == 1)

In [0]:
# Cast plan as (target column, source column, Spark type), applied in order.
# Where target and source differ, the accented source column is copied into a
# new ASCII-named column; otherwise the column is re-typed in place.
_cast_plan = [
    ("id_envio", "id_envio", IntegerType()),
    ("ligacoes_do_cliente", "ligações_do_cliente", IntegerType()),
    ("avaliacao_do_cliente", "avaliação_do_cliente", IntegerType()),
    ("preco", "preço", FloatType()),
    ("qtd_itens", "qtd_itens", IntegerType()),
    ("desconto", "desconto", FloatType()),
    ("peso_g", "peso_g", FloatType()),
    ("avaliacaoEntrega", "avaliacaoEntrega", IntegerType()),
    ("dtIngest", "dtIngest", TimestampType()),
]

for _target, _source, _dtype in _cast_plan:
    df = df.withColumn(_target, F.col(_source).cast(_dtype))

In [0]:
# Calendar table from the bronze layer; the join cell reads its
# `data_completa` and `data` columns.
df_calendario = spark.read.table("grao.bronze.calendario")

In [0]:
# The calendar is joined twice (shipping date and delivery date). Each join
# gets its own alias so the key is referenced with an unambiguous qualifier:
# reusing `df_calendario.data_completa` for the second join points at a column
# that is already present on the left side from the first join, which Spark
# either rejects as an ambiguous self-join reference or binds to the wrong side.
cal_envio = df_calendario.alias("cal_envio")
cal_entrega = df_calendario.alias("cal_entrega")

# Shipping date: match DataEnvio against the calendar key and expose the
# calendar's `data` column as `dtenvio`.
df_2 = df.join(cal_envio, df["DataEnvio"] == F.col("cal_envio.data_completa"), "inner")
df_2 = df_2.withColumnRenamed("data", "dtenvio")

# Delivery date: same lookup for dataEntrega. The first `data` column was
# already renamed, so only the newly joined one becomes `dtentrega`.
df_2 = df_2.join(
    cal_entrega,
    df_2["dataEntrega"] == F.col("cal_entrega.data_completa"),
    "inner",
)

# Drop the accented source columns that were copied to ASCII-named columns.
# NOTE(review): the remaining calendar columns are now present twice (once per
# join); this is harmless as long as later cells select columns explicitly.
df_2 = df_2.withColumnRenamed("data", "dtentrega").drop(
    "ligações_do_cliente", "avaliação_do_cliente", "preço"
)

In [0]:
# Lower-case every column name in a single projection.
# The previous loop rebuilt the result from `df_2` on each iteration, so only
# the last column's rename survived; toDF renames all columns positionally and
# also copes with repeated column names.
df_final = df_2.toDF(*[col_name.lower() for col_name in df_2.columns])

In [0]:
# Columns written to the silver table, in output order.
columns = [
    "id_envio",
    "corredor_de_armazenagem",
    "metodo_de_envio",
    "ligacoes_do_cliente",
    "avaliacao_do_cliente",
    "preco",
    "qtd_itens",
    "importancia",
    "genero",
    "desconto",
    "peso_g",
    "chegou_no_tempo",
    "destino",
    "dtenvio",
    "dtentrega",
    "avaliacaoentrega",
    "dtingest",
]

# Project down to exactly those columns; everything else is discarded.
df_final = df_final.select(columns)

In [0]:
# Persist the result as a Delta table, replacing any existing contents.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("grao.silver.grain_shipping")
)