In [1]:
#import libraries
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Window
from pyspark.sql.functions import row_number
from pyspark.sql.types import DecimalType
import traceback

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 3, Finished, Available, Finished)

In [2]:
#Parameterize workspace and lakehouse name
Workspace = "NewRetailCompany"
Lakehouse = "Staging_Zone"

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 4, Finished, Available, Finished)

In [3]:
#Renaming column names of rebate lz and type castin columns

lz_rebate_df = spark.read.table("lz_rebate")

# Assign columns names and type cast
col_names = [
    "SalesRepID","SalesRepName","SalesRepRegion","CustomerCountry",
    "CustomerRegion","SaleLocation","CurrencyCode","CustomerName",
    "SaleTransactionID","CustomerID","RebateDatetime",
    "RebateTransactionID","RebateType","RebateAmount"
]

for i, col_name in enumerate(lz_rebate_df.columns):
    lz_rebate_df = lz_rebate_df.withColumnRenamed(col_name, col_names[i])

try:
    sz_rebate_df = lz_rebate_df \
        .withColumn("RebateDatetime", F.to_timestamp("RebateDatetime")) \
        .withColumn("RebateAmount", F.col("RebateAmount").cast(DecimalType(18,2)))

except AnalysisException as e:
    print("Spark AnalysisException occurred:", str(e))
    traceback.print_exc()

except Exception as e:
    print("Unexpected error occurred:", str(e))
    traceback.print_exc()



StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 5, Finished, Available, Finished)

In [4]:
sz_rebate_df.printSchema()

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 6, Finished, Available, Finished)

root
 |-- SalesRepID: string (nullable = true)
 |-- SalesRepName: string (nullable = true)
 |-- SalesRepRegion: string (nullable = true)
 |-- CustomerCountry: string (nullable = true)
 |-- CustomerRegion: string (nullable = true)
 |-- SaleLocation: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- SaleTransactionID: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- RebateDatetime: timestamp (nullable = true)
 |-- RebateTransactionID: string (nullable = true)
 |-- RebateType: string (nullable = true)
 |-- RebateAmount: decimal(18,2) (nullable = true)



In [5]:
# Trim string columns to remove leading/trailing spaces
sz_rebate_df = sz_rebate_df \
    .withColumn("SalesRepID", F.trim(F.col("SalesRepID"))) \
    .withColumn("CustomerID", F.trim(F.col("CustomerID"))) \
    .withColumn("SaleTransactionID", F.trim(F.col("SaleTransactionID"))) \
    .withColumn("RebateTransactionID", F.trim(F.col("RebateTransactionID")))

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 7, Finished, Available, Finished)

In [6]:
#First letter uppercase for names, region and location
#Country code and curreny code in uppercase
sz_rebate_df = sz_rebate_df \
    .withColumn("SalesRepName", F.initcap(F.col("SalesRepName"))) \
    .withColumn("SalesRepRegion", F.initcap(F.col("SalesRepRegion"))) \
    .withColumn("CustomerName", F.initcap(F.col("CustomerName"))) \
    .withColumn("CustomerRegion", F.initcap(F.col("CustomerRegion"))) \
    .withColumn("SaleLocation", F.initcap(F.col("SaleLocation"))) \
    .withColumn("RebateType", F.initcap(F.col("RebateType"))) \
    .withColumn("CustomerCountry", F.upper(F.col("CustomerCountry"))) \
    .withColumn("CurrencyCode", F.upper(F.col("CurrencyCode")))

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 8, Finished, Available, Finished)

In [7]:
sz_rebate_df.count()

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 9, Finished, Available, Finished)

2709

In [8]:
window_spec_rebate = Window.partitionBy("RebateTransactionID").orderBy(F.col("RebateDatetime").desc())

# Rank rows within each RebateTransactionID
sz_rebate_df = sz_rebate_df \
    .withColumn("row_num", F.row_number().over(window_spec_rebate)) \
    .filter(F.col("row_num") == 1) \
    .drop("row_num")

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 10, Finished, Available, Finished)

In [9]:
sz_rebate_df.count()

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 11, Finished, Available, Finished)

2701

In [10]:
sz_rebate_df.printSchema()

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 12, Finished, Available, Finished)

root
 |-- SalesRepID: string (nullable = true)
 |-- SalesRepName: string (nullable = true)
 |-- SalesRepRegion: string (nullable = true)
 |-- CustomerCountry: string (nullable = true)
 |-- CustomerRegion: string (nullable = true)
 |-- SaleLocation: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- SaleTransactionID: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- RebateDatetime: timestamp (nullable = true)
 |-- RebateTransactionID: string (nullable = true)
 |-- RebateType: string (nullable = true)
 |-- RebateAmount: decimal(18,2) (nullable = true)



In [11]:
#Column Sales Location contains continent and country
#Normalizing column Sales Location into sub columns
#And drop Hierarchy columns
sz_rebate_df = sz_rebate_df.withColumn(
    "SaleLocationHierarchy", 
    F.split(F.col("SaleLocation"),"\\|")
    )
sz_rebate_df = sz_rebate_df \
    .withColumn("SaleRegion", F.col("SaleLocationHierarchy")[0]) \
    .withColumn("SaleCountry", F.col("SaleLocationHierarchy")[1]) 
   


StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 13, Finished, Available, Finished)

In [12]:
#drop extra columns
sz_rebate_df = sz_rebate_df.drop("SaleLocationHierarchy","SaleLocation")
sz_rebate_df.printSchema()

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 14, Finished, Available, Finished)

root
 |-- SalesRepID: string (nullable = true)
 |-- SalesRepName: string (nullable = true)
 |-- SalesRepRegion: string (nullable = true)
 |-- CustomerCountry: string (nullable = true)
 |-- CustomerRegion: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- SaleTransactionID: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- RebateDatetime: timestamp (nullable = true)
 |-- RebateTransactionID: string (nullable = true)
 |-- RebateType: string (nullable = true)
 |-- RebateAmount: decimal(18,2) (nullable = true)
 |-- SaleRegion: string (nullable = true)
 |-- SaleCountry: string (nullable = true)



In [13]:
sz_rebate_df.count()


StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 15, Finished, Available, Finished)

2701

In [14]:
#rebateamount cotains outlier(27 rows)
sz_rebate_df = sz_rebate_df.where(F.col("RebateAmount") <= 3000)


StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 16, Finished, Available, Finished)

In [15]:
sz_rebate_df.count()


StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 17, Finished, Available, Finished)

2673

In [18]:
# write transformed data to silver layer
try:
    
    sz_rebate_df.write \
        .format("delta") \
        .mode("overwrite") \
        .save("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/sz_rebate")

    print(" Silver layer rebate transformation completed successfully!")

except Exception as e:
    print("Unexpected error occurred:", str(e))
    traceback.print_exc()

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 20, Finished, Available, Finished)

 Silver layer rebate transformation completed successfully!


In [17]:
#test to check is data is saved in silver layer
try:
    spark.read.format("delta")\
              .load("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/sz_rebate")\
              .show(5)

except Exception as e:
    print("Unexpected error occurred:", str(e))
    traceback.print_exc()

StatementMeta(, 3fc53680-e9be-499e-bf14-195d38135a45, 19, Finished, Available, Finished)

+----------+----------------+--------------+---------------+--------------+------------+----------------+-----------------+----------+-------------------+-------------------+----------+------------+-------------+-----------+
|SalesRepID|    SalesRepName|SalesRepRegion|CustomerCountry|CustomerRegion|CurrencyCode|    CustomerName|SaleTransactionID|CustomerID|     RebateDatetime|RebateTransactionID|RebateType|RebateAmount|   SaleRegion|SaleCountry|
+----------+----------------+--------------+---------------+--------------+------------+----------------+-----------------+----------+-------------------+-------------------+----------+------------+-------------+-----------+
| 68786-160|Brendis Blankley|          Emea|         MEXICO| North America|         EUR|Samantha Rosales|    8221046822414|SR98078592|2025-02-18 14:57:43|      0001062538722|     Value|        1.16|         Emea|     greece|
|49288-0943| Roderich Bevens|          Emea|         FRANCE|          Emea|         CAD|    Robin Ab