In [0]:
%run ./Utils

In [0]:
from pyspark.sql.functions import format_number
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Silver Layer: Cleaning and Transformation

Apply transformations and denormalize the data in the Silver layer. Use partitioning to improve read and write performance.

In [0]:
# Var in this notebook
Bronze_path = "sales_case.bronze_sales_table"

#Define the table name and database
database_name = 'sales_case'
table_name = 'silver_sales_table'

# Check if the table exists
Table_exists = spark.sql(f"SHOW TABLES IN {database_name} LIKE '{table_name}'").count() > 0
Table_exists

Out[9]: False

#### Reading data from Bronze Layer to clean and transform it on Silver Layer

In [0]:
# Reading from a Delta table
if Table_exists:
    df_bronze = spark.readStream.table(Bronze_path)
else:
    df_bronze = spark.read.table(Bronze_path)

display(df_bronze.take(10))

ProductID,Date,ClientID,CampaignID,Units,Product,Category,Segment,ManufacturerID,Manufacturer,UnitCost,UnitPrice,PostalCode,EmailName,City,State,Region,District,Country,filename
506,2011-11-11,234187,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55932,"(Magee.Flowers@xyza.com): Flowers, Magee","Elgin, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-24,249135,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55357,"(Adrian.Pacheco@xyza.com): Pacheco, Adrian","Loretto, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-21,124543,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55066,"(Ashely.Mcgowan@xyza.com): Mcgowan, Ashely","Red Wing, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-21,152994,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,56082,"(Abel.Cardenas@xyza.com): Cardenas, Abel","Saint Peter, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-22,205534,15,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55070,"(Hall.Booth@xyza.com): Booth, Hall","Saint Francis, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-06-03,133314,17,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55792,"(Macon.Austin@xyza.com): Austin, Macon","Virginia, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-18,273553,13,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,56628,"(Liberty.Solis@xyza.com): Solis, Liberty","Bigfork, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-29,19629,18,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55021,"(Isaiah.Witt@xyza.com): Witt, Isaiah","Faribault, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-04-27,222396,17,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,56329,"(Hedda.Oneill@xyza.com): Oneill, Hedda","Foley, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-10-16,106354,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55337,"(Marshall.Myers@xyza.com): Myers, Marshall","Burnsville, MN, USA",MN,Central,District #28,USA,dados_2011.csv


### Data Cleaning
Data cleaning is a crucial process to ensure data quality. This involves removing duplicate or incorrect data, standardizing data formats and values, and enriching data with additional information. Furthermore, it is important to check and correct quality issues, such as errors and inconsistencies, to ensure that the data is accurate and reliable.

In [0]:
# Do transformations, including manipulation of email and CampaignID fields
df_silver = df_bronze.withColumn("Date", to_date(col("Date"), "yyyy-MM-dd")) \
                     .withColumn("Email", lower(expr("regexp_replace(split(EmailName, ':')[0], '[()]', '')"))) \
                     .withColumn("Name", expr("split(split(EmailName, ':')[1], ', ')")) \
                     .withColumn("Name", expr("concat(Name[1], ' ', Name[0])")) \
                     .withColumn("City", expr("split(City, ',')[0]")) \
                     .withColumn("UnitPrice", format_number(col("UnitPrice"), 2)) \
                     .withColumn("UnitCost", format_number(col("UnitCost"), 2)) \
                     .withColumn("SalesTotal", format_number(col("UnitCost") * col("Units"),2)) \
                     .drop("EmailName")\
                     .drop("CampaignID")

display(df_silver.take(10))
                     

ProductID,Date,ClientID,Units,Product,Category,Segment,ManufacturerID,Manufacturer,UnitCost,UnitPrice,PostalCode,City,State,Region,District,Country,filename,Email,Name,SalesTotal
506,2011-11-11,234187,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55932,Elgin,MN,Central,District #28,USA,dados_2011.csv,magee.flowers@xyza.com,Magee Flowers,90.83
506,2011-05-24,249135,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55357,Loretto,MN,Central,District #28,USA,dados_2011.csv,adrian.pacheco@xyza.com,Adrian Pacheco,90.83
506,2011-05-21,124543,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55066,Red Wing,MN,Central,District #28,USA,dados_2011.csv,ashely.mcgowan@xyza.com,Ashely Mcgowan,90.83
506,2011-05-21,152994,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,56082,Saint Peter,MN,Central,District #28,USA,dados_2011.csv,abel.cardenas@xyza.com,Abel Cardenas,90.83
506,2011-05-22,205534,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55070,Saint Francis,MN,Central,District #28,USA,dados_2011.csv,hall.booth@xyza.com,Hall Booth,90.83
506,2011-06-03,133314,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55792,Virginia,MN,Central,District #28,USA,dados_2011.csv,macon.austin@xyza.com,Macon Austin,90.83
506,2011-05-18,273553,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,56628,Bigfork,MN,Central,District #28,USA,dados_2011.csv,liberty.solis@xyza.com,Liberty Solis,90.83
506,2011-05-29,19629,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55021,Faribault,MN,Central,District #28,USA,dados_2011.csv,isaiah.witt@xyza.com,Isaiah Witt,90.83
506,2011-04-27,222396,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,56329,Foley,MN,Central,District #28,USA,dados_2011.csv,hedda.oneill@xyza.com,Hedda Oneill,90.83
506,2011-10-16,106354,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55337,Burnsville,MN,Central,District #28,USA,dados_2011.csv,marshall.myers@xyza.com,Marshall Myers,90.83


### Save Silver Transformations

Partitioning by year and month to optimize date-based queries, with a recommendation for file size in Delta format.

In [0]:
if Table_exists:
  df_silver.writeStream \
    .outputMode("append") \
    .withColumn("Year", year("Date")) \
    .withColumn("Month", month("Date")) \
    .partitionBy("Year", "Month") \
    .format("parquet") \
    .option("checkpointLocation", f"/mnt/{database_name}/_checkpoint_{table_name}") \
    .table(f"{database_name}.{table_name}")

else:
  df_silver.withColumn("Year", year("Date")) \
    .withColumn("Month", month("Date")) \
    .write.option("maxRecordsPerFile", 50000) \
    .partitionBy("Year", "Month") \
    .mode("overwrite") \
    .format("parquet") \
    .option("checkpointLocation", f"/mnt/{database_name}/_checkpoint_{table_name}") \
    .saveAsTable(f"{database_name}.{table_name}")

df_silver.count()

Out[21]: 353888

In [0]:
%sql
describe table extended sales_case.silver_sales_table

col_name,data_type,comment
ProductID,int,
Date,date,
ClientID,int,
Units,int,
Product,string,
Category,string,
Segment,string,
ManufacturerID,int,
Manufacturer,string,
UnitCost,string,


In [0]:
display(spark.read.table(f"{database_name}.{table_name}").take(10))

ProductID,Date,ClientID,Units,Product,Category,Segment,ManufacturerID,Manufacturer,UnitCost,UnitPrice,PostalCode,City,State,Region,District,Country,filename,Email,Name,SalesTotal,Year,Month
506,2011-01-02,159938,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55041,Lake City,MN,Central,District #28,USA,dados_2011.csv,pascale.ferguson@xyza.com,Pascale Ferguson,90.83,2011,1
506,2011-01-27,158876,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55316,Champlin,MN,Central,District #28,USA,dados_2011.csv,regina.villarreal@xyza.com,Regina Villarreal,90.83,2011,1
506,2011-01-28,100427,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,56401,Brainerd,MN,Central,District #28,USA,dados_2011.csv,ciara.alvarado@xyza.com,Ciara Alvarado,90.83,2011,1
506,2011-01-30,219021,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55709,Bovey,MN,Central,District #28,USA,dados_2011.csv,yoko.english@xyza.com,Yoko English,90.83,2011,1
506,2011-01-29,280795,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55816,Duluth,MN,Central,District #28,USA,dados_2011.csv,eve.hamilton@xyza.com,Eve Hamilton,90.83,2011,1
506,2011-01-30,30565,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,56001,Mankato,MN,Central,District #28,USA,dados_2011.csv,hoyt.ashley@xyza.com,Hoyt Ashley,90.83,2011,1
506,2011-01-27,168503,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55117,Saint Paul,MN,Central,District #28,USA,dados_2011.csv,morgan.smith@xyza.com,Morgan Smith,90.83,2011,1
506,2011-01-27,162455,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55127,Saint Paul,MN,Central,District #28,USA,dados_2011.csv,talon.hudson@xyza.com,Talon Hudson,90.83,2011,1
506,2011-01-22,119705,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55109,Saint Paul,MN,Central,District #28,USA,dados_2011.csv,aphrodite.simmons@xyza.com,Aphrodite Simmons,90.83,2011,1
506,2011-01-29,162456,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.83,124.42,55127,Saint Paul,MN,Central,District #28,USA,dados_2011.csv,kibo.monroe@xyza.com,Kibo Monroe,90.83,2011,1


### Cleaning DF from Memory to optmmize

In [0]:
import gc
gc.collect()

df_bronze.unpersist()
df_silver.unpersist()

Out[24]: DataFrame[ProductID: int, Date: date, ClientID: int, Units: int, Product: string, Category: string, Segment: string, ManufacturerID: int, Manufacturer: string, UnitCost: string, UnitPrice: string, PostalCode: string, City: string, State: string, Region: string, District: string, Country: string, filename: string, Email: string, Name: string, SalesTotal: string]