### Importing the Needed Modules

In [0]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

from delta.tables import DeltaTable
import sys
sys.path.append("/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform")

from src.paths import   BRONZE_TRANSACTIONS_PATH, SILVER_TRANSACTIONS_PATH
from src.schema_definitions import SILVER_TRANSACTIONS_SCHEMA

### Querying the Bronze Transactions Table

In [0]:
# Load the Bronze transactions table and preview the first few rows.
trans_bronze_df = spark.table(BRONZE_TRANSACTIONS_PATH)
display(trans_bronze_df.limit(5))

Invoice ID,Line,Customer ID,Product ID,Size,Color,Unit Price,Quantity,Date,Discount,Line Total,Store ID,Employee ID,Currency,Currency Symbol,SKU,Transaction Type,Payment Method,Invoice Total,ingestion_ts,_source_file
INV-US-001-03558761,1,47162,485,M,,80.5,1,2023-01-01T15:42:00.000Z,0.0,80.5,1,7,USD,$,MASU485-M-,Sale,Cash,126.7,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558761,2,47162,2779,G,,31.5,1,2023-01-01T15:42:00.000Z,0.4,18.9,1,7,USD,$,CHCO2779-G-,Sale,Cash,126.7,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558761,3,47162,64,M,NEUTRAL,45.5,1,2023-01-01T15:42:00.000Z,0.4,27.3,1,7,USD,$,MACO64-M-NEUTRAL,Sale,Cash,126.7,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558762,1,10142,131,M,BLUE,70.0,1,2023-01-01T20:04:00.000Z,0.4,42.0,1,6,USD,$,FECO131-M-BLUE,Sale,Cash,77.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558762,2,10142,716,L,WHITE,26.0,1,2023-01-01T20:04:00.000Z,0.0,26.0,1,6,USD,$,MAT-716-L-WHITE,Sale,Cash,77.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv


### Silver Transactions Schema Reference

In [0]:
# Show the target Silver schema (column name -> type name) imported from
# src.schema_definitions, as a reference for the casts performed below.
SILVER_TRANSACTIONS_SCHEMA

{'invoice_id': 'string',
 'line': 'integer',
 'customer_id': 'integer',
 'product_id': 'integer',
 'size': 'string',
 'color': 'string',
 'unit_price': 'double',
 'quantity': 'integer',
 'date': 'timestamp',
 'discount': 'double',
 'line_total': 'double',
 'store_id': 'integer',
 'employee_id': 'integer',
 'currency': 'string',
 'currency_symbol': 'string',
 'sku': 'string',
 'transaction_type': 'string',
 'payment_method': 'string',
 'invoice_total': 'double'}

### Schema Enforcement and Column Name Standardization

In [0]:
# Projection spec: (Bronze column name, target Spark type, Silver column name).
# The order of this list is the column order of the resulting frame.
bronze_to_silver_spec = [
    ("Invoice ID", StringType(), "invoice_id"),
    ("Line", IntegerType(), "line"),
    ("Customer ID", IntegerType(), "customer_id"),
    ("Product ID", IntegerType(), "product_id"),
    ("Size", StringType(), "size"),
    ("Color", StringType(), "color"),
    ("Unit Price", DoubleType(), "unit_price"),
    ("Quantity", IntegerType(), "quantity"),
    ("Date", TimestampType(), "date"),
    ("Discount", DoubleType(), "discount"),
    ("Line Total", DoubleType(), "line_total"),
    ("Store ID", IntegerType(), "store_id"),
    ("Employee ID", IntegerType(), "employee_id"),
    ("Currency", StringType(), "currency"),
    ("Currency Symbol", StringType(), "currency_symbol"),
    ("SKU", StringType(), "sku"),
    ("Transaction Type", StringType(), "transaction_type"),
    ("Payment Method", StringType(), "payment_method"),
    ("Invoice Total", DoubleType(), "invoice_total"),
]

# Lineage columns are carried over unchanged (no cast, no rename).
lineage_columns = ["ingestion_ts", "_source_file"]

typed_columns = [
    col(bronze_name).cast(spark_type).alias(silver_name)
    for bronze_name, spark_type, silver_name in bronze_to_silver_spec
]
passthrough_columns = [col(name) for name in lineage_columns]

trans_silver_df = trans_bronze_df.select(*typed_columns, *passthrough_columns)

### Trimming Whitespace and Normalizing Value Casing

In [0]:
# Per-column text normalization. Every listed column is trimmed; some are
# additionally upper-cased or title-cased (initcap).
def _trim_upper(column):
    """Trim surrounding whitespace, then upper-case."""
    return upper(trim(column))


def _trim_initcap(column):
    """Trim surrounding whitespace, then capitalize each word."""
    return initcap(trim(column))


text_formatters = {
    "invoice_id": trim,
    "size": _trim_upper,
    "color": _trim_upper,
    "currency": _trim_upper,
    "currency_symbol": trim,
    "sku": _trim_upper,
    "transaction_type": _trim_initcap,
    "payment_method": _trim_initcap,
}

# withColumn on an existing name replaces it in place, so column order is kept.
formatted_df = trans_silver_df
for column_name, formatter in text_formatters.items():
    formatted_df = formatted_df.withColumn(column_name, formatter(col(column_name)))

trans_silver_df = formatted_df

In [0]:
# Preview a handful of rows after casting and text normalization.
display(trans_silver_df.limit(5))


invoice_id,line,customer_id,product_id,size,color,unit_price,quantity,date,discount,line_total,store_id,employee_id,currency,currency_symbol,sku,transaction_type,payment_method,invoice_total,ingestion_ts,_source_file
INV-US-001-03558761,1,47162,485,M,,80.5,1,2023-01-01T15:42:00.000Z,0.0,80.5,1,7,USD,$,MASU485-M-,Sale,Cash,126.7,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558761,2,47162,2779,G,,31.5,1,2023-01-01T15:42:00.000Z,0.4,18.9,1,7,USD,$,CHCO2779-G-,Sale,Cash,126.7,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558761,3,47162,64,M,NEUTRAL,45.5,1,2023-01-01T15:42:00.000Z,0.4,27.3,1,7,USD,$,MACO64-M-NEUTRAL,Sale,Cash,126.7,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558762,1,10142,131,M,BLUE,70.0,1,2023-01-01T20:04:00.000Z,0.4,42.0,1,6,USD,$,FECO131-M-BLUE,Sale,Cash,77.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-001-03558762,2,10142,716,L,WHITE,26.0,1,2023-01-01T20:04:00.000Z,0.0,26.0,1,6,USD,$,MAT-716-L-WHITE,Sale,Cash,77.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv


### Dropping Duplicate Rows

In [0]:
# Remove rows that are identical across every column, reporting the row count
# before and after. Each count() is a separate Spark action.
rows_before_dedup = trans_silver_df.count()
print(f"Before deduplicate count : {rows_before_dedup}")

trans_silver_df = trans_silver_df.distinct()

rows_after_dedup = trans_silver_df.count()
print(f"After deduplicate count : {rows_after_dedup}")

Before deduplicate count : 6416827
After deduplicate count : 6416029


### Filtering Out Rows with a Null invoice_id, line, or customer_id

In [0]:
# Keep only rows where all three key columns are populated; these columns are
# later used as the match condition when writing to the Silver table.
has_all_keys = (
    col("invoice_id").isNotNull()
    & col("line").isNotNull()
    & col("customer_id").isNotNull()
)
trans_silver_df = trans_silver_df.where(has_all_keys)

### Null Check

In [0]:
# Count nulls per column: flag each null as 1 and add the flags up.
# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the wildcard
# import), not Python's builtin.
null_flag_totals = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in trans_silver_df.columns
]
null_counts = trans_silver_df.select(*null_flag_totals)
display(null_counts)

invoice_id,line,customer_id,product_id,size,color,unit_price,quantity,date,discount,line_total,store_id,employee_id,currency,currency_symbol,sku,transaction_type,payment_method,invoice_total,ingestion_ts,_source_file
0,0,0,0,413049,4350231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Handling Null Values

In [0]:
# Replace missing size / color values with a placeholder label.
placeholder_columns = ["size", "color"]
trans_silver_df = trans_silver_df.na.fill("UnKnown", subset=placeholder_columns)

### Validating Nulls

In [0]:
# Re-run the per-column null count to confirm no nulls remain after the fill.
# NOTE: `sum` is pyspark.sql.functions.sum (wildcard import), not the builtin.
remaining_null_totals = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in trans_silver_df.columns
]
null_counts = trans_silver_df.select(*remaining_null_totals)
display(null_counts)

invoice_id,line,customer_id,product_id,size,color,unit_price,quantity,date,discount,line_total,store_id,employee_id,currency,currency_symbol,sku,transaction_type,payment_method,invoice_total,ingestion_ts,_source_file
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Schema Enforcement Check

In [0]:
# Validate the prepared frame against SILVER_TRANSACTIONS_SCHEMA before it is
# written: no unexpected columns, no missing columns, and matching data types.
# The run is stopped on any violation so a drifted schema never reaches Silver.
lineage_cols = {"ingestion_ts", "_source_file"}  # carried from Bronze, not part of the schema dict

expected_cols = set(SILVER_TRANSACTIONS_SCHEMA.keys())
incoming_cols = set(trans_silver_df.columns)

unknown_cols = incoming_cols - expected_cols - lineage_cols
missing_cols = expected_cols - incoming_cols

# DataType.typeName() yields names such as 'string', 'integer', 'double',
# 'timestamp', which is the vocabulary used by the schema dict shown above.
# NOTE(review): if parameterized types (e.g. decimal(p, s)) are ever added to
# the schema dict, this comparison will need adjusting.
actual_types = {
    field.name: field.dataType.typeName() for field in trans_silver_df.schema.fields
}
type_mismatches = {
    name: {"expected": expected_type, "actual": actual_types[name]}
    for name, expected_type in SILVER_TRANSACTIONS_SCHEMA.items()
    if name in actual_types and actual_types[name] != expected_type
}

print("Unknown columns in Bronze:", unknown_cols)
print("Missing columns in Silver frame:", missing_cols)
print("Column type mismatches:", type_mismatches)

if unknown_cols or missing_cols or type_mismatches:
    raise ValueError(
        "Silver transactions schema check failed: "
        f"unknown={sorted(unknown_cols)}, "
        f"missing={sorted(missing_cols)}, "
        f"type_mismatches={type_mismatches}"
    )


Unknown columns in Bronze: set()


### Creating or Updating Silver Transactions Table

In [0]:
# Create the Silver transactions table on first run, otherwise upsert into it.
#
# Rows are matched on the business key below. Delta MERGE fails when more than
# one source row matches the same target row, and the earlier whole-row
# de-duplication does not guarantee key uniqueness (two rows for the same key
# can still differ in ingestion_ts, _source_file, or any value column).
# The source is therefore reduced to one row per key first, keeping the most
# recently ingested one.
merge_keys = ["invoice_id", "line", "customer_id"]

latest_first = Window.partitionBy(*merge_keys).orderBy(col("ingestion_ts").desc())
trans_silver_src_df = (
    trans_silver_df
    .withColumn("_key_rank", row_number().over(latest_first))
    .filter(col("_key_rank") == 1)
    .drop("_key_rank")
)

if not spark.catalog.tableExists(SILVER_TRANSACTIONS_PATH):
    # First load: materialize the table from the key-unique frame.
    (
        trans_silver_src_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(SILVER_TRANSACTIONS_PATH)
    )
else:
    trans_silver_tbl = DeltaTable.forName(spark, SILVER_TRANSACTIONS_PATH)

    # "tgt.<key> = src.<key> and ..." built from the same key list used above,
    # so the de-duplication key and the match condition cannot drift apart.
    merge_condition = " and ".join(f"tgt.{key} = src.{key}" for key in merge_keys)

    (
        trans_silver_tbl.alias("tgt")
        .merge(trans_silver_src_df.alias("src"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
# Read the Silver table back and preview a few rows to confirm the write.
silver_preview_df = spark.table(SILVER_TRANSACTIONS_PATH).limit(5)
display(silver_preview_df)

invoice_id,line,customer_id,product_id,size,color,unit_price,quantity,date,discount,line_total,store_id,employee_id,currency,currency_symbol,sku,transaction_type,payment_method,invoice_total,ingestion_ts,_source_file
INV-US-005-04340923,1,288057,13766,M,BLUE,79.5,3,2024-11-17T10:55:00.000Z,0.0,238.5,5,58,USD,$,MACO13766-M-BLUE,Sale,Credit Card,238.5,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-005-04341815,1,286620,12615,M,UnKnown,29.5,1,2024-11-24T08:44:00.000Z,0.0,29.5,5,57,USD,$,FET-12615-M-,Sale,Credit Card,29.5,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-005-04343433,1,269768,12092,S,YELLOW,50.0,1,2024-12-02T13:36:00.000Z,0.0,50.0,5,55,USD,$,FESW12092-S-YELLOW,Sale,Credit Card,147.5,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-005-04343613,1,291191,13540,UnKnown,UnKnown,35.0,1,2024-12-04T16:42:00.000Z,0.0,35.0,5,59,USD,$,MAAC13540--,Sale,Credit Card,35.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
INV-US-005-04344133,1,294592,14941,M,UnKnown,119.0,1,2024-12-06T09:40:00.000Z,0.0,119.0,5,59,USD,$,MASU14941-M-,Sale,Credit Card,119.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv


In [0]:
# Total number of rows now stored in the Silver transactions table.
spark.table(SILVER_TRANSACTIONS_PATH).count()

6416029