# Tabela Vendas Work

## Imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Análise Inicial

In [0]:
# Pull a 100-row sample of the raw sales table and render it for inspection.
# NOTE(review): `vendas` keeps this LIMIT 100 sample, so every later cell that
# reuses `vendas` works on at most 100 rows — confirm that this is intended.
sample_query = "SELECT * FROM raw.vendas_main_raw LIMIT 100"
vendas = spark.sql(sample_query)
vendas.display()

pdv,cupom,pco,desconto,pcotot,descontotot,qtd,data_ins,hora,ts_referencia,data,produto,cod_loja
504,97994,39.99,0.0,39.99,0.0,1.0,2019-12-23 22:01:02.950,21:51:14,2021-01-22 16:04:32.195,2019-12-23,3109710,95
303,70,3.33,1.66,3.33,1.66,1.0,2019-12-23 21:01:02.613,20:47:46,2021-01-22 16:04:32.195,2019-12-23,3109862,106
305,151,3.33,1.66,3.33,1.66,1.0,2019-12-23 21:11:02.080,21:00:52,2021-01-22 16:04:32.195,2019-12-23,3109862,106
204,19221,5.99,0.0,5.99,0.0,1.0,2019-12-23 19:45:02.913,18:28:20,2021-01-22 16:04:32.195,2019-12-23,3108683,104
304,40,10.49,0.0,10.49,0.0,1.0,2019-12-23 09:23:01.957,09:05:28,2021-01-22 16:04:32.195,2019-12-23,3071480,92
303,154,2.99,0.0,2.99,0.0,1.0,2019-12-23 18:51:02.850,18:17:05,2021-01-22 16:04:32.195,2019-12-23,3113746,91
201,8785,10.99,0.0,10.99,0.0,1.0,2019-12-23 16:51:02.557,15:37:01,2021-01-22 16:04:32.195,2019-12-23,3113198,97
2,101050,1.99,0.0,1.99,0.0,1.0,2019-12-23 15:29:02.720,14:22:27,2021-01-22 16:04:32.195,2019-12-23,3099188,100
7,58343,1.99,0.0,1.99,0.0,1.0,2019-12-23 15:51:02.277,15:18:15,2021-01-22 16:04:32.195,2019-12-23,3099188,100
302,310,3.33,1.66,3.33,1.66,1.0,2019-12-23 22:01:02.950,21:52:36,2021-01-22 16:04:32.195,2019-12-23,3109948,89


## Casting

In [0]:
# Normalise column types: parse `data` as a date, then for each numeric
# column replace every ',' with '.' (presumably a decimal-comma fix — confirm
# against the raw source) and cast to the target type.
integer_columns = ['cod_loja', 'produto', 'pdv', 'cupom', 'qtd']
double_columns = ['pco', 'desconto', 'pcotot', 'descontotot']

vendas = vendas.withColumn('data', to_date('data', 'yyyy-MM-dd'))

for int_col_name in integer_columns:
    # NOTE(review): `qtd` shows up as 1.0 in the sample output; an integer
    # cast would lose any fractional quantity — confirm quantities are whole.
    dotted_value = regexp_replace(int_col_name, ',', '.')
    vendas = vendas.withColumn(int_col_name, dotted_value.cast(IntegerType()))

for double_col_name in double_columns:
    dotted_value = regexp_replace(double_col_name, ',', '.')
    vendas = vendas.withColumn(double_col_name, dotted_value.cast(DoubleType()))

## Renomeando e Organizando as Colunas

In [0]:
# Replace the terse raw column names with descriptive ones.
# Keys are the current names, values are the names used from here on.
renamed_columns = {
    'data': 'data_extracao',
    'hora': 'hora_extracao',
    'produto': 'cod_produto',
    'pdv': 'ponto_de_venda',
    'pco': 'preco_unitario',
    'desconto': 'desconto_unitario',
    'pcotot': 'preco_total',
    'descontotot': 'desconto_total',
    'qtd': 'quantidade_vendida',
}

for old_name, new_name in renamed_columns.items():
    vendas = vendas.withColumnRenamed(old_name, new_name)

In [0]:
# Keep only the columns below, in this order; any column not listed
# is dropped from `vendas`.
ordered_columns = [
    'data_extracao',
    'hora_extracao',
    'cod_loja',
    'cod_produto',
    'ponto_de_venda',
    'cupom',
    'preco_unitario',
    'desconto_unitario',
    'preco_total',
    'desconto_total',
    'quantidade_vendida',
]
vendas = vendas.select(*ordered_columns)

## Removendo linhas duplicadas

In [0]:
# Collapse the DataFrame to a single partition, then drop rows that are
# duplicated across all columns.
# NOTE(review): coalesce(1) funnels every row through one partition; that is
# harmless for a small sample but should be revisited for full data volumes.
single_partition = vendas.coalesce(1)
vendas = single_partition.dropDuplicates()

## Salvando Dados

In [0]:
# Persist the cleaned DataFrame as table `work.vendas_main_work`, stored at an
# explicit path and replacing any existing contents (mode 'overwrite').
# NOTE(review): partitioning by `cod_produto` as well as date and store may
# produce a very large number of small partitions — confirm the cardinality.
partition_columns = ['data_extracao', 'cod_loja', 'cod_produto']

(
    vendas.write
    .option('path', '/mnt/work/vendas_main_dev')
    .partitionBy(partition_columns)
    .mode('overwrite')
    .saveAsTable('work.vendas_main_work')
)

## Otimizando

In [0]:
%sql
-- Run OPTIMIZE on the work table that the previous cell writes; the cell
-- output reports the table path and the metrics of the operation.
OPTIMIZE work.vendas_main_work

path,metrics
dbfs:/mnt/work/vendas_main_dev,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 63, null, 0, 63, 63, true, 0, 0, 1680132555296, 1680132556149, 8, 0, null)"
