In [0]:
%run ./Utils

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## 1. Bronze Layer: Raw Data Ingestion

The Bronze layer stores the raw data as-is, without transformations. In this notebook the raw data is simply written as a Parquet table (the code below uses `format("parquet")`, not Delta).

The notebook reads raw data from CSV files in the landing zone and writes this data in Parquet format to the Bronze layer table.

#### Reading Data from the Landing Zone (DBFS)

Check whether the Bronze table already exists. If it exists, process the data with streaming read/write; otherwise, process it with a regular (batch) read/write.

In [0]:
# Database and table that hold the Bronze layer data
database_name = 'sales_case'
table_name = 'bronze_sales_table'

# List the tables in the database whose name matches the Bronze table;
# at least one matching row means the table already exists.
matching_tables = spark.sql(f"SHOW TABLES IN {database_name} LIKE '{table_name}'")
table_exists = matching_tables.count() > 0

# Last expression of the cell: echo the flag so the chosen processing path is visible
table_exists

Out[25]: False

In [0]:
# Read the landing-zone CSV files into `df`.
#
# When the Bronze table already exists the files are read as a stream
# (spark.readStream); otherwise they are read as a regular batch DataFrame
# (spark.read). Both paths share the same source configuration, so only the
# reader entry point differs.
#
# Schemas("LandingZone") and GetLandingZoneDirectory() are imported from %run Utils NB
reader = spark.readStream if table_exists else spark.read

df = (
    reader
    .format("csv")
    # An explicit schema is supplied, so no "inferSchema" option is needed.
    .schema(Schemas("LandingZone"))
    .option("header", "true")
    .load(GetLandingZoneDirectory())
    # Keep only the last path segment (the file name) of the source file of each row
    .withColumn("filename", regexp_extract(input_file_name(), "([^/]+)$", 0))
)

if not table_exists:
    # Preview is only done for the batch DataFrame (same as the original else-branch)
    display(df.take(10))

ProductID,Date,ClientID,CampaignID,Units,Product,Category,Segment,ManufacturerID,Manufacturer,UnitCost,UnitPrice,PostalCode,EmailName,City,State,Region,District,Country,filename
449,2011-09-21,113077,21,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33180,"(Lysandra.Castaneda@xyza.com): Castaneda, Lysandra","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-10-10,234410,18,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33158,"(Dorothy.Rodriquez@xyza.com): Rodriquez, Dorothy","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-10-07,58091,20,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33186,"(Beau.Sutton@xyza.com): Sutton, Beau","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-10-08,114284,20,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33165,"(Boris.Leonard@xyza.com): Leonard, Boris","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-10-30,205070,18,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33179,"(Rafael.Fox@xyza.com): Fox, Rafael","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-11-22,139458,16,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33143,"(Martena.Guy@xyza.com): Guy, Martena","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-11-08,151912,16,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33155,"(Ina.Winters@xyza.com): Winters, Ina","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-08-17,176229,1,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33189,"(Kelly.Nicholson@xyza.com): Nicholson, Kelly","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-08-10,76694,3,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33176,"(Mufutau.Morton@xyza.com): Morton, Mufutau","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv
449,2011-09-23,146382,5,1,Maximus UM-54,Urban,Moderation,7,VanArsdel,74.7299175,102.36975,33175,"(Courtney.Marshall@xyza.com): Marshall, Courtney","Miami, FL, USA",FL,East,District #10,USA,dados_2011.csv


### Saving to the Bronze Table

In [0]:
# Persist `df` into the Bronze table as Parquet.
#
# - Table already exists  -> streaming append, tracked through a checkpoint directory.
# - Table does not exist  -> one-off batch write that creates/overwrites the table.
target_table = f"{database_name}.{table_name}"
checkpoint_path = f"/mnt/{database_name}/_checkpoint_{table_name}"

if table_exists:
    # toTable() starts the streaming query and returns its handle. Keep the handle
    # so the stream can be awaited or stopped later, e.g.
    # query.awaitTermination() / query.stop().
    # NOTE(review): on the first streaming run the checkpoint directory is new, so files
    # already loaded by the initial batch run will presumably be picked up again —
    # confirm how the landing zone is managed between runs.
    query = (
        df.writeStream
        .outputMode("append")
        .format("parquet")
        .option("checkpointLocation", checkpoint_path)
        .toTable(target_table)
    )
else:
    # Batch write: no checkpointLocation here, that option only applies to streaming writes.
    (
        df.write
        .mode("overwrite")
        .format("parquet")
        .saveAsTable(target_table)
    )


In [0]:
# Manual reset helper (destructive): uncomment to recursively delete the Bronze table
# directory under the Hive warehouse. Left commented out so Run All never triggers it.
#dbutils.fs.rm("/user/hive/warehouse/sales_case.db/bronze_sales_table", recurse=True)

Out[36]: True

In [0]:
%sql
-- Inspect the columns and the extended metadata of the Bronze table
DESCRIBE TABLE EXTENDED sales_case.bronze_sales_table

col_name,data_type,comment
ProductID,int,
Date,date,
ClientID,int,
CampaignID,int,
Units,int,
Product,string,
Category,string,
Segment,string,
ManufacturerID,int,
Manufacturer,string,


In [0]:
# Read the Bronze table back and show its first 10 rows to confirm the write worked
bronze_preview = spark.read.table(f"{database_name}.{table_name}").take(10)
display(bronze_preview)

ProductID,Date,ClientID,CampaignID,Units,Product,Category,Segment,ManufacturerID,Manufacturer,UnitCost,UnitPrice,PostalCode,EmailName,City,State,Region,District,Country,filename
506,2011-11-11,234187,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55932,"(Magee.Flowers@xyza.com): Flowers, Magee","Elgin, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-24,249135,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55357,"(Adrian.Pacheco@xyza.com): Pacheco, Adrian","Loretto, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-21,124543,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55066,"(Ashely.Mcgowan@xyza.com): Mcgowan, Ashely","Red Wing, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-21,152994,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,56082,"(Abel.Cardenas@xyza.com): Cardenas, Abel","Saint Peter, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-22,205534,15,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55070,"(Hall.Booth@xyza.com): Booth, Hall","Saint Francis, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-06-03,133314,17,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55792,"(Macon.Austin@xyza.com): Austin, Macon","Virginia, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-18,273553,13,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,56628,"(Liberty.Solis@xyza.com): Solis, Liberty","Bigfork, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-05-29,19629,18,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55021,"(Isaiah.Witt@xyza.com): Witt, Isaiah","Faribault, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-04-27,222396,17,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,56329,"(Hedda.Oneill@xyza.com): Oneill, Hedda","Foley, MN, USA",MN,Central,District #28,USA,dados_2011.csv
506,2011-10-16,106354,16,1,Maximus UM-11,Urban,Moderation,7,VanArsdel,90.8264175,124.41975,55337,"(Marshall.Myers@xyza.com): Myers, Marshall","Burnsville, MN, USA",MN,Central,District #28,USA,dados_2011.csv


### Cleaning the DataFrame from Memory to Optimize

In [0]:
import gc
# Run a Python garbage-collection pass on the driver now that the write is done.
gc.collect()

# Release any cached data held for df. unpersist() is the last expression, so the cell
# echoes the DataFrame it returns.
# NOTE(review): no cache()/persist() call on df is visible in this notebook, so this is
# presumably a no-op — confirm whether it is still needed.
df.unpersist()

Out[40]: DataFrame[ProductID: int, Date: date, ClientID: int, CampaignID: int, Units: int, Product: string, Category: string, Segment: string, ManufacturerID: int, Manufacturer: string, UnitCost: double, UnitPrice: double, PostalCode: string, EmailName: string, City: string, State: string, Region: string, District: string, Country: string, filename: string]