### Importing the Needed Modules

In [0]:
import sys
import os

# Location of the repository checkout inside the Databricks workspace.
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Register the repo root at the front of the import search path so that the
# `src` package can be imported; skipped when the entry is already present.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

# Diagnostic output only (can be dropped once the path setup is trusted).
print("Current working directory:", os.getcwd())
print("Repo root added to path:", PROJECT_ROOT in sys.path)
from src.paths import *

In [0]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

from delta.tables import DeltaTable
import sys

# Make the repo root importable so the `src.*` imports below resolve.
# The append is guarded so that running this cell after the path has already
# been registered (or re-running the cell) does not pile up duplicate entries
# in sys.path.
_repo_root = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"
if _repo_root not in sys.path:
    sys.path.append(_repo_root)

from src.paths import   BRONZE_TRANSACTIONS_PATH, SILVER_TRANSACTIONS_PATH
from src.schema_definitions import SILVER_TRANSACTIONS_SCHEMA

### Querying the Bronze Transactions Table

In [0]:
# Load the Bronze transactions table identified by BRONZE_TRANSACTIONS_PATH.
trans_bronze_df = spark.read.table(BRONZE_TRANSACTIONS_PATH)

# Render a five-row sample only, so the cell output stays small.
display(trans_bronze_df.limit(5))

### Silver Transactions Schema Reference

In [0]:
# Echo the imported Silver schema definition as this cell's output so it can be
# consulted while reading the casts and the schema check later in the notebook.
SILVER_TRANSACTIONS_SCHEMA

### Schema Enforcement and Column Name Standardization

In [0]:
# One entry per business column: (Bronze column name, target Spark type,
# Silver column name). The order here is the column order of the result.
_bronze_to_silver_columns = [
    ("Invoice ID", StringType(), "invoice_id"),
    ("Line", IntegerType(), "line"),
    ("Customer ID", IntegerType(), "customer_id"),
    ("Product ID", IntegerType(), "product_id"),
    ("Size", StringType(), "size"),
    ("Color", StringType(), "color"),
    ("Unit Price", DoubleType(), "unit_price"),
    ("Quantity", IntegerType(), "quantity"),
    ("Date", TimestampType(), "date"),
    ("Discount", DoubleType(), "discount"),
    ("Line Total", DoubleType(), "line_total"),
    ("Store ID", IntegerType(), "store_id"),
    ("Employee ID", IntegerType(), "employee_id"),
    ("Currency", StringType(), "currency"),
    ("Currency Symbol", StringType(), "currency_symbol"),
    ("SKU", StringType(), "sku"),
    ("Transaction Type", StringType(), "transaction_type"),
    ("Payment Method", StringType(), "payment_method"),
    ("Invoice Total", DoubleType(), "invoice_total"),
]

# Cast every Bronze column to its target type and rename it to snake_case.
trans_silver_df = trans_bronze_df.select(
    *[
        col(source_name).cast(target_type).alias(target_name)
        for source_name, target_type, target_name in _bronze_to_silver_columns
    ],
    # Ingestion metadata columns are passed through untouched.
    col("ingestion_ts"),
    col("_source_file"),
)

### Trimming Whitespace and Normalizing Text Values

In [0]:
# Normalised replacement expression for each text column:
#   - identifiers / symbols: surrounding whitespace trimmed
#   - codes (size, color, currency, sku): trimmed and upper-cased
#   - labels (transaction_type, payment_method): trimmed and title-cased
_cleaned_text_columns = {
    "invoice_id": trim(col("invoice_id")),
    "size": upper(trim(col("size"))),
    "color": upper(trim(col("color"))),
    "currency": upper(trim(col("currency"))),
    "currency_symbol": trim(col("currency_symbol")),
    "sku": upper(trim(col("sku"))),
    "transaction_type": initcap(trim(col("transaction_type"))),
    "payment_method": initcap(trim(col("payment_method"))),
}

# Rebuild the frame in a single projection: columns listed above are swapped
# for their cleaned expression, every other column is passed through, and the
# original column order is kept.
trans_silver_df = trans_silver_df.select(
    *[
        _cleaned_text_columns[name].alias(name)
        if name in _cleaned_text_columns
        else col(name)
        for name in trans_silver_df.columns
    ]
)

In [0]:
# Preview five rows of the cleaned frame.
display(trans_silver_df.limit(5))


### Dropping Duplicate Rows

In [0]:
# Row count before removing exact duplicates (each count() runs a Spark job).
print(f"Before deduplicate count : {trans_silver_df.count()}")

# Keep a single copy of rows that are identical across every column.
trans_silver_df = trans_silver_df.distinct()

print(f"After deduplicate count : {trans_silver_df.count()}")

### Filtering out null invoice_id, line, and customer_id rows

In [0]:
# Keep only rows where all three key columns are populated.
trans_silver_df = trans_silver_df.filter(
    col("invoice_id").isNotNull()
    & col("line").isNotNull()
    & col("customer_id").isNotNull()
)

### Null Check

In [0]:
# One-row summary holding, for every column, the number of NULL values.
# `sum` here is the Spark aggregate brought in by the wildcard import from
# pyspark.sql.functions, not Python's built-in sum.
null_counts = trans_silver_df.agg(
    *[
        sum(col(name).isNull().cast(IntegerType())).alias(name)
        for name in trans_silver_df.columns
    ]
)
display(null_counts)

### Handling Null Values

In [0]:
# Replace missing size / color values with a placeholder label.
# NOTE(review): the placeholder is mixed-case while size and color are
# upper-cased earlier in the notebook — confirm that this is intended.
trans_silver_df = trans_silver_df.na.fill("UnKnown", subset=["size", "color"])

### Validating Nulls

In [0]:
# Recompute the per-column NULL tally after the fill step: each NULL contributes
# 1 and every other value 0. `sum` and `when` are the Spark functions brought in
# by the wildcard import from pyspark.sql.functions.
null_counts = trans_silver_df.select(
    [
        sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
        for name in trans_silver_df.columns
    ]
)
display(null_counts)

### Schema Enforcement Check

In [0]:
# Compare the column names declared by the Silver schema definition with the
# columns actually present on the DataFrame, in both directions.
expected_cols = set(SILVER_TRANSACTIONS_SCHEMA.keys())
incoming_cols = set(trans_silver_df.columns)

# Ingestion metadata columns travel with the data and are not reported as
# unknown even when the schema definition does not list them.
metadata_cols = {"ingestion_ts", "_source_file"}

# Columns present on the DataFrame that the schema does not declare.
unknown_cols = incoming_cols - expected_cols - metadata_cols

# Columns the schema declares that the DataFrame does not carry.
missing_cols = expected_cols - incoming_cols

print("Unknown columns in Bronze:", unknown_cols)
print("Missing columns expected by Silver schema:", sorted(missing_cols))


### Creating or Updating Silver Transactions Table

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# -----------------------------
# Deduplicate source data
# -----------------------------
# Keep exactly one row per (invoice_id, line, customer_id).
#
# Rows are ranked newest-first by `date`. Rows for the same key that were
# ingested more than once share the same `date` but differ in their ingestion
# metadata, so the earlier full-row de-duplication does not collapse them.
# Ordering by `date` alone would leave such rows tied and let row_number()
# pick an arbitrary one; `ingestion_ts` and `_source_file` are therefore used
# as tie-breakers so the most recently ingested row wins deterministically.
window_spec = Window.partitionBy(
    "invoice_id", "line", "customer_id"
).orderBy(
    col("date").desc(),
    col("ingestion_ts").desc(),
    col("_source_file").desc(),
)

trans_silver_dedup_df = (
    trans_silver_df
    .withColumn("rn", row_number().over(window_spec))
    .filter(col("rn") == 1)  # rank 1 = the preferred row for its key
    .drop("rn")              # helper column is not part of the output
)

# -----------------------------
# Create or Merge into Silver table
# -----------------------------
if spark.catalog.tableExists(SILVER_TRANSACTIONS_PATH):
    # The table is already there: upsert the de-duplicated batch into it.
    # Rows matching on (invoice_id, line, customer_id) have all columns
    # updated from the source; unmatched source rows are inserted.
    trans_silver_tbl = DeltaTable.forName(spark, SILVER_TRANSACTIONS_PATH)

    (
        trans_silver_tbl.alias("tgt")
        .merge(
            source=trans_silver_dedup_df.alias("src"),
            condition="""
            tgt.invoice_id = src.invoice_id
            AND tgt.line = src.line
            AND tgt.customer_id = src.customer_id
            """,
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # No table yet: write the batch out as a new Delta table.
    silver_writer = trans_silver_dedup_df.write.format("delta").mode("overwrite")
    silver_writer.saveAsTable(SILVER_TRANSACTIONS_PATH)


In [0]:
# Read the Silver table back and show a five-row sample of what was written.
display(spark.read.table(SILVER_TRANSACTIONS_PATH).limit(5))

In [0]:
# Total number of rows currently in the Silver transactions table; the value is
# shown as this cell's output.
spark.read.table(SILVER_TRANSACTIONS_PATH).count()