### Importing the Needed Modules

In [0]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
import sys
sys.path.append("/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform")

from src.paths import BRONZE_EMPLOYEES_PATH, SILVER_EMPLOYEES_PATH
from src.schema_definitions import SILVER_EMPLOYEES_SCHEMA

### Querying the Bronze Employees Table

In [0]:
emp_bronze_df = spark.read.table(BRONZE_EMPLOYEES_PATH)
emp_bronze_df.limit(5).display()

Employee ID,Store ID,Name,Position,ingestion_ts,_source_file
1,1,Stephen Johnson,Store Manager,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
2,1,Rebecca Myers,Assistant Manager,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
3,1,Katherine Buchanan,Cashier,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
4,1,Jessica Hicks,Stock Clerk,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
5,1,Ryan Gross,Sales Associate,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv


### Silver Employees Schema Reference

In [0]:
SILVER_EMPLOYEES_SCHEMA

{'employee_id': 'integer',
 'store_id': 'integer',
 'name': 'string',
 'position': 'string'}

### Schema Enforcement and Column Name Standardizing

In [0]:
emp_silver_df = emp_bronze_df.select(
    col("Employee ID").cast(IntegerType()).alias("employee_id"),
    col("Store ID").cast(IntegerType()).alias("store_id"),
    col("Name").cast(StringType()).alias("name"),
    col("Position").cast(StringType()).alias("position"),
    col("ingestion_ts"),
    col("_source_file")
)

### Cleaning Empty spaces and Formatting values

In [0]:
emp_silver_df = (
    emp_silver_df.withColumn("name", initcap(trim(col("name"))))
    .withColumn("position", initcap(trim(col("position"))))
)

In [0]:
emp_silver_df.limit(5).display()

employee_id,store_id,name,position,ingestion_ts,_source_file
1,1,Stephen Johnson,Store Manager,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
2,1,Rebecca Myers,Assistant Manager,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
3,1,Katherine Buchanan,Cashier,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
4,1,Jessica Hicks,Stock Clerk,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
5,1,Ryan Gross,Sales Associate,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv


### Dropping Duplicate Rows

In [0]:
print(f"Before deduplicate count : {emp_silver_df.count()}")
emp_silver_df = emp_silver_df.dropDuplicates()
print(f"After deduplicate count : {emp_silver_df.count()}")

Before deduplicate count : 404
After deduplicate count : 404


### Filtering out null employee_id, store_id, and name rows

In [0]:
emp_silver_df = emp_silver_df.filter('(employee_id is not null) and (store_id is not null) and (name is not null)')

### validating the primary Key

In [0]:
emp_silver_df.groupBy("employee_id").agg(count('*').alias("count")).filter('count > 1').display()

employee_id,count


### Null Check

In [0]:
null_counts = emp_silver_df.select([sum(col(c).isNull().cast(IntegerType())).alias(c) for c in emp_silver_df.columns])
display(null_counts)

employee_id,store_id,name,position,ingestion_ts,_source_file
0,0,0,0,0,0


### Schema Enforcement Check

In [0]:
expected_cols = set(SILVER_EMPLOYEES_SCHEMA.keys())
incoming_cols = set(emp_silver_df.columns)

unknown_cols = incoming_cols - expected_cols - {
    "ingestion_ts", "_source_file"
}

print("Unknown columns in Bronze:", unknown_cols)


Unknown columns in Bronze: set()


### Creating or Updating Silver Employees Table

In [0]:
if not spark.catalog.tableExists(SILVER_EMPLOYEES_PATH):
    (
        emp_silver_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(SILVER_EMPLOYEES_PATH)
    )
else:
    cust_silver_tbl = DeltaTable.forName(spark, SILVER_EMPLOYEES_PATH)

    cust_silver_tbl.alias("tgt").merge(
        emp_silver_df.alias("src"),
        "tgt.employee_id = src.employee_id"
    ).whenMatchedUpdateAll() \
    .whenNotMatchedInsertAll() \
    .execute()

In [0]:
spark.read.table(SILVER_EMPLOYEES_PATH).limit(5).display()

employee_id,store_id,name,position,ingestion_ts,_source_file
159,13,Rosina Albers,Sales Associate,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
197,17,Richard Parks,Cashier,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
362,32,Soraia Batista-maia,Assistant Manager,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
397,35,Teresa Freitas,Cashier,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
210,18,Keith Small,Stock Clerk,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv


In [0]:
spark.read.table(SILVER_EMPLOYEES_PATH).count()

404