### Importing the Needed Modules

In [0]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
import sys
sys.path.append("/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform")

from src.paths import BRONZE_DISCOUNTS_PATH, SILVER_DISCOUNTS_PATH
from src.schema_definitions import SILVER_DISCOUNTS_SCHEMA

### Querying the Bronze Discounts Table

In [0]:
# Load the Bronze discounts table; every later cell derives from this frame.
disc_bronze_df = spark.read.table(BRONZE_DISCOUNTS_PATH)

# Preview only a handful of rows instead of rendering the whole table.
display(disc_bronze_df.limit(5))

Start,End,Discont,Description,Category,Sub Category,ingestion_ts,_source_file
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Feminine,Coats and Blazers,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Feminine,Sweaters and Knitwear,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Masculine,Coats and Blazers,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Masculine,Sweaters and Sweatshirts,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Children,Coats,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv


### Silver Discounts Schema Reference

In [0]:
# Echo the imported Silver schema mapping (column name -> type name) so the
# target column names and types are visible next to the casts that follow.
SILVER_DISCOUNTS_SCHEMA

{'discount_start_date': 'date',
 'discount_end_date': 'date',
 'discount': 'double',
 'description': 'string',
 'category': 'string',
 'sub_category': 'string'}

### Schema Enforcement and Column Name Standardizing

In [0]:
# (bronze column, silver column, target Spark type) for every column carried
# into Silver. Keeping the mapping as data makes the rename/cast rules easy to
# scan and keeps the select itself to one line.
bronze_to_silver_columns = [
    ("Start", "discount_start_date", DateType()),
    ("End", "discount_end_date", DateType()),
    ("Discont", "discount", DoubleType()),  # source header is spelled "Discont"
    ("Description", "description", StringType()),
    ("Category", "category", StringType()),
    ("Sub Category", "sub_category", StringType()),
    # Lineage columns keep their Bronze names.
    ("ingestion_ts", "ingestion_ts", TimestampType()),
    ("_source_file", "_source_file", StringType()),
]

# Cast each Bronze column to its Silver type and rename it in a single pass.
disc_silver_df = disc_bronze_df.select(
    *[
        col(bronze_name).cast(spark_type).alias(silver_name)
        for bronze_name, silver_name, spark_type in bronze_to_silver_columns
    ]
)

### Trimming Whitespace and Formatting Values

In [0]:
# Strip leading/trailing whitespace from each free-text column, one column at
# a time, replacing the column in place on the Silver frame.
for text_column in ("description", "category", "sub_category"):
    disc_silver_df = disc_silver_df.withColumn(text_column, trim(col(text_column)))

In [0]:
# Spot-check the first few rows after casting, renaming and trimming.
display(disc_silver_df.limit(5))

discount_start_date,discount_end_date,discount,description,category,sub_category,ingestion_ts,_source_file
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Feminine,Coats and Blazers,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Feminine,Sweaters and Knitwear,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Masculine,Coats and Blazers,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Masculine,Sweaters and Sweatshirts,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Children,Coats,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv


### Dropping Duplicate Rows

In [0]:
# Remove exact duplicate rows (all columns equal) from the Silver frame.
#
# The deduplicated frame must be assigned back to `disc_silver_df`: the later
# null check, schema check and merge all read that name. Assigning it to any
# other variable leaves the duplicates in place and makes the "After" count
# below meaningless.
rows_before_dedupe = disc_silver_df.count()
print(f"Before deduplicate count : {rows_before_dedupe}")

disc_silver_df = disc_silver_df.dropDuplicates()

rows_after_dedupe = disc_silver_df.count()
print(f"After deduplicate count : {rows_after_dedupe}")

Before deduplicate count : 181
After deduplicate count : 181


### Null Check

In [0]:
# Build one aggregate per column: cast the isNull flag to 0/1 and add it up.
# NOTE: `sum` here has to be the Spark aggregate brought in by the
# `from pyspark.sql.functions import *` star import, not Python's builtin.
null_count_exprs = [
    sum(col(column_name).isNull().cast(IntegerType())).alias(column_name)
    for column_name in disc_silver_df.columns
]

# Single-row frame holding the number of nulls found in each column.
null_counts = disc_silver_df.select(*null_count_exprs)
display(null_counts)

discount_start_date,discount_end_date,discount,description,category,sub_category,ingestion_ts,_source_file
0,0,0,0,10,10,0,0


### Schema Enforcement Check

In [0]:
# Compare the prepared Silver frame against SILVER_DISCOUNTS_SCHEMA in three
# ways: columns that should not be there, columns that are missing, and
# columns whose Spark type differs from the declared one.
expected_cols = set(SILVER_DISCOUNTS_SCHEMA.keys())
incoming_cols = set(disc_silver_df.columns)

# Lineage columns are carried alongside the business columns and are not part
# of the schema mapping, so they are excluded from the "unknown" report.
lineage_cols = {"ingestion_ts", "_source_file"}

unknown_cols = incoming_cols - expected_cols - lineage_cols

# Expected columns that never made it into the frame; the original check only
# looked for extras and would silently pass if a column was dropped.
missing_cols = expected_cols - incoming_cols

# `DataFrame.dtypes` yields (name, simple type string) pairs such as
# ("discount", "double").
# NOTE(review): assumes SILVER_DISCOUNTS_SCHEMA values use the same simple type
# names Spark reports (e.g. 'date', 'double', 'string') - confirm if new types
# are added to the schema definition.
actual_types = dict(disc_silver_df.dtypes)
type_mismatches = {
    column_name: (expected_type, actual_types[column_name])
    for column_name, expected_type in SILVER_DISCOUNTS_SCHEMA.items()
    if column_name in actual_types and actual_types[column_name] != expected_type
}

# Message text kept as-is; note the columns inspected are those of the Silver
# frame built from Bronze.
print("Unknown columns in Bronze:", unknown_cols)
print("Missing expected columns:", missing_cols)
print("Column type mismatches (expected, actual):", type_mismatches)


Unknown columns in Bronze: set()


### Creating or Updating Silver Discounts Table

In [0]:
# Create the Silver discounts table on first run, otherwise upsert into it.
#
# A discount row is identified by its date window plus category/sub-category.
# The comparison uses the null-safe operator `<=>` rather than `=`: the null
# check in this notebook reports rows with NULL category / sub_category, and
# `NULL = NULL` never evaluates to true. With plain `=` those rows can never
# match their existing copy in the target, so each re-run would insert them
# again and the table would accumulate duplicates.
merge_key_columns = [
    "discount_start_date",
    "discount_end_date",
    "category",
    "sub_category",
]
merge_condition = " AND ".join(
    f"tgt.{key} <=> src.{key}" for key in merge_key_columns
)

if not spark.catalog.tableExists(SILVER_DISCOUNTS_PATH):
    # First load: materialise the prepared frame as a new Delta table.
    (
        disc_silver_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(SILVER_DISCOUNTS_PATH)
    )
else:
    disc_silver_tbl = DeltaTable.forName(spark, SILVER_DISCOUNTS_PATH)

    # Upsert: refresh every column of rows whose key already exists and insert
    # the rows whose key is new.
    (
        disc_silver_tbl.alias("tgt")
        .merge(disc_silver_df.alias("src"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
# Read the persisted Silver table back and preview a few rows to confirm the write.
silver_table_preview = spark.read.table(SILVER_DISCOUNTS_PATH).limit(5)
display(silver_table_preview)

discount_start_date,discount_end_date,discount,description,category,sub_category,ingestion_ts,_source_file
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Feminine,Coats and Blazers,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Feminine,Sweaters and Knitwear,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Masculine,Coats and Blazers,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Masculine,Sweaters and Sweatshirts,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv
2020-01-01,2020-01-10,0.4,40% discount during our New Year Winter Sale,Children,Coats,2026-01-14T05:41:44.554Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/discounts.csv


In [0]:
# Row count of the persisted Silver table; the bare last expression is the cell output.
silver_row_count = spark.read.table(SILVER_DISCOUNTS_PATH).count()
silver_row_count

181