### Importing the Needed Modules

In [0]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
import sys
sys.path.append("/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform")

from src.paths import BRONZE_CUSTOMERS_PATH, SILVER_CUSTOMERS_PATH
from src.schema_definitions import SILVER_CUSTOMERS_SCHEMA

### Querying the Bronze Customers Table

In [0]:
cust_bronze_df = spark.read.table(BRONZE_CUSTOMERS_PATH)
cust_bronze_df.limit(5).display()

Customer ID,Name,Email,Telephone,City,Country,Gender,Date Of Birth,Job Title,ingestion_ts,_source_file
1,Tyler Garcia,tyler.garcia@fake_gmail.com,922.970.2265x47563,New York,United States,M,2003-07-15,,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
2,Joshua Miller,joshua.miller@fake_gmail.com,+1-958-729-6169,New York,United States,M,2000-06-16,Records manager,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
3,Alison Marshall DDS,alison.marshall.dds@fake_hotmail.com,+1-645-567-0876x5409,New York,United States,F,2003-07-22,,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
4,Jeffery Acosta,jeffery.acosta@fake_yahoo.com,212.336.0912x84994,New York,United States,M,1996-11-12,Proofreader,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
5,Ashley Sanders,ashley.sanders@fake_hotmail.com,7814535781,New York,United States,F,1998-02-10,Exercise physiologist,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv


### Silver Customers Schema Reference

In [0]:
SILVER_CUSTOMERS_SCHEMA

{'customer_id': 'integer',
 'name': 'string',
 'email': 'string',
 'telephone': 'string',
 'city': 'string',
 'country': 'string',
 'gender': 'string',
 'date_of_birth': 'date',
 'job_title': 'string'}

### Schema Enforcement and Column Name Standardizing

In [0]:
cust_silver_df = cust_bronze_df.select(
    col("Customer ID").cast(IntegerType()).alias("customer_id"),
    col("Name").cast(StringType()).alias("name"),
    col("Email").cast(StringType()).alias("email"),
    col("Telephone").cast(StringType()).alias("telephone"),
    col("City").cast(StringType()).alias("city"),
    col("Country").cast(StringType()).alias("country"),
    col("Gender").cast(StringType()).alias("gender"),  
    col("Date Of Birth").cast(DateType()).alias("date_of_birth"),
    col("Job Title").cast(StringType()).alias("job_title"),
    col("ingestion_ts").cast(TimestampType()),
    col("_source_file").cast(StringType())
)


### Cleaning Empty spaces and Formatting Values

In [0]:
cust_silver_df = (
    cust_silver_df.withColumn("name", initcap(trim(col("name"))))
    .withColumn("email", lower(trim(col("email"))))
    .withColumn("city", initcap(trim(col("city"))))
    .withColumn("country", upper(trim(col("country"))))
    .withColumn("gender", when((lower(trim(col("gender"))) == "m") | (lower(trim(col("gender"))) == "male"), "Male")
        .when((lower(trim(col("gender"))) == "f") | (lower(trim(col("gender"))) == "female"), "Female")
        .otherwise("Unknown"))
    .withColumn("job_title", initcap(trim(col("job_title"))))
)

In [0]:
cust_silver_df.limit(5).display()

customer_id,name,email,telephone,city,country,gender,date_of_birth,job_title,ingestion_ts,_source_file
1,Tyler Garcia,tyler.garcia@fake_gmail.com,922.970.2265x47563,New York,UNITED STATES,Male,2003-07-15,,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
2,Joshua Miller,joshua.miller@fake_gmail.com,+1-958-729-6169,New York,UNITED STATES,Male,2000-06-16,Records Manager,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
3,Alison Marshall Dds,alison.marshall.dds@fake_hotmail.com,+1-645-567-0876x5409,New York,UNITED STATES,Female,2003-07-22,,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
4,Jeffery Acosta,jeffery.acosta@fake_yahoo.com,212.336.0912x84994,New York,UNITED STATES,Male,1996-11-12,Proofreader,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
5,Ashley Sanders,ashley.sanders@fake_hotmail.com,7814535781,New York,UNITED STATES,Female,1998-02-10,Exercise Physiologist,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv


### Dropping Duplicate Rows

In [0]:
print(f"Before deduplicate count : {cust_silver_df.count()}")
cust_silver_df = cust_silver_df.dropDuplicates(['customer_id', 'name', 'email'])
print(f"After deduplicate count : {cust_silver_df.count()}")

Before deduplicate count : 1643306
After deduplicate count : 1643306


### Filtering out null customer_id, name, and email rows

In [0]:
cust_silver_df = cust_silver_df.filter('(customer_id is not null) and (name is not null) and (email is not null)')

### validating the primary Key

In [0]:
cust_silver_df.groupBy("customer_id").agg(count('*').alias("count")).filter('count > 1').display()

customer_id,count


### Null Check

In [0]:
null_counts = cust_silver_df.select([sum(col(c).isNull().cast(IntegerType())).alias(c) for c in cust_silver_df.columns])
display(null_counts)

customer_id,name,email,telephone,city,country,gender,date_of_birth,job_title,ingestion_ts,_source_file
0,0,0,0,0,0,0,0,584185,0,0


### Handling Null Values

In [0]:
cust_silver_df = cust_silver_df.fillna("UnKnown", ["city", "country",  "gender", "job_title"])

### Validating Nulls

In [0]:
null_counts = cust_silver_df.select([sum(col(c).isNull().cast(IntegerType())).alias(c) for c in cust_silver_df.columns])
display(null_counts)

customer_id,name,email,telephone,city,country,gender,date_of_birth,job_title,ingestion_ts,_source_file
0,0,0,0,0,0,0,0,0,0,0


### Schema Enforcement Check

In [0]:
expected_cols = set(SILVER_CUSTOMERS_SCHEMA.keys())
incoming_cols = set(cust_silver_df.columns)

unknown_cols = incoming_cols - expected_cols - {
    "ingestion_ts", "_source_file"
}

print("Unknown columns in Bronze:", unknown_cols)


Unknown columns in Bronze: set()


### Creating or Updating Silver Customers Table

In [0]:
if not spark.catalog.tableExists(SILVER_CUSTOMERS_PATH):
    (
        cust_silver_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(SILVER_CUSTOMERS_PATH)
    )
else:
    cust_silver_tbl = DeltaTable.forName(spark, SILVER_CUSTOMERS_PATH)

    cust_silver_tbl.alias("tgt").merge(
        cust_silver_df.alias("src"),
        "tgt.customer_id = src.customer_id"
    ).whenMatchedUpdateAll() \
    .whenNotMatchedInsertAll() \
    .execute()

In [0]:
spark.read.table(SILVER_CUSTOMERS_PATH).limit(5).display()

customer_id,name,email,telephone,city,country,gender,date_of_birth,job_title,ingestion_ts,_source_file
94,Melissa Porter,melissa.porter@fake_hotmail.com,001-915-876-4475x65903,New York,UNITED STATES,Female,1977-08-23,Geophysicist/field Seismologist,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
586,Daniel Gallegos,daniel.gallegos@fake_gmail.com,(609)511-6278x1772,New York,UNITED STATES,Male,2003-04-05,UnKnown,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
1230,Catherine Mcmillan,catherine.mcmillan@fake_yahoo.com,372-603-6029,New York,UNITED STATES,Female,1992-09-11,Drilling Engineer,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
2060,Kevin White,kevin.white@fake_yahoo.com,+1-267-803-3725x942,New York,UNITED STATES,Male,2004-01-15,UnKnown,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
3668,David Brady,david.brady@fake_hotmail.com,(991)949-5547,New York,UNITED STATES,Male,2003-01-27,UnKnown,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv


In [0]:
spark.read.table(SILVER_CUSTOMERS_PATH).count()

1643306