### Importing the Needed Modules

In [0]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
import sys
sys.path.append("/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform")

from src.paths import BRONZE_STORES_PATH, SILVER_STORES_PATH
from src.schema_definitions import SILVER_STORES_SCHEMA

### Querying the Bronze Stores Table

In [0]:
# Load the Bronze stores table and preview the first five rows.
stores_bronze_df = spark.read.table(BRONZE_STORES_PATH)
display(stores_bronze_df.limit(5))

Store ID,Country,City,Store Name,Number of Employees,ZIP Code,Latitude,Longitude,ingestion_ts,_source_file
1,United States,New York,Store New York,10,10001,40.7128,-74.006,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
2,United States,Los Angeles,Store Los Angeles,8,90001,34.0522,-118.2437,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
3,United States,Chicago,Store Chicago,9,60601,41.8781,-87.6298,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
4,United States,Houston,Store Houston,10,77001,29.7604,-95.3698,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
5,United States,Phoenix,Store Phoenix,9,85001,33.4484,-112.074,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv


### Silver Stores Schema Reference

In [0]:
# Show the Silver stores schema (column name -> type name) imported from
# src.schema_definitions, as a reference for the casts applied below.
SILVER_STORES_SCHEMA

{'store_id': 'integer',
 'country': 'string',
 'city': 'string',
 'store_name': 'string',
 'number_of_employees': 'integer',
 'zip_code': 'string',
 'latitude': 'double',
 'longitude': 'double'}

### Schema Enforcement and Column Name Standardization

In [0]:
# (Bronze column name, Silver column name, Silver data type), in output order.
store_column_specs = [
    ("Store ID", "store_id", IntegerType()),
    ("Country", "country", StringType()),
    ("City", "city", StringType()),
    ("Store Name", "store_name", StringType()),
    ("Number of Employees", "number_of_employees", IntegerType()),
    ("ZIP Code", "zip_code", StringType()),
    ("Latitude", "latitude", DoubleType()),
    ("Longitude", "longitude", DoubleType()),
]

# Cast each Bronze column to its target type and rename it to snake_case.
typed_store_columns = [
    col(bronze_name).cast(data_type).alias(silver_name)
    for bronze_name, silver_name, data_type in store_column_specs
]

# The two metadata columns are passed through unchanged after the typed columns.
stores_silver_df = stores_bronze_df.select(
    *typed_store_columns,
    col("ingestion_ts"),
    col("_source_file"),
)


### Trimming Whitespace and Formatting Values

In [0]:
# Text columns that are trimmed and then re-cased: country is upper-cased,
# city and store_name are converted with initcap.
text_formatters = [
    ("country", upper),
    ("city", initcap),
    ("store_name", initcap),
]

for column_name, formatter in text_formatters:
    stores_silver_df = stores_silver_df.withColumn(
        column_name, formatter(trim(col(column_name)))
    )

# zip_code is only trimmed; its casing is left as-is.
stores_silver_df = stores_silver_df.withColumn("zip_code", trim(col("zip_code")))

In [0]:
# Preview the first five rows after casting and text clean-up.
display(stores_silver_df.limit(5))

store_id,country,city,store_name,number_of_employees,zip_code,latitude,longitude,ingestion_ts,_source_file
1,UNITED STATES,New York,Store New York,10,10001,40.7128,-74.006,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
2,UNITED STATES,Los Angeles,Store Los Angeles,8,90001,34.0522,-118.2437,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
3,UNITED STATES,Chicago,Store Chicago,9,60601,41.8781,-87.6298,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
4,UNITED STATES,Houston,Store Houston,10,77001,29.7604,-95.3698,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
5,UNITED STATES,Phoenix,Store Phoenix,9,85001,33.4484,-112.074,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv


### Dropping Duplicate Rows

In [0]:
# Report the row count on either side of the de-duplication so its effect is visible.
dedupe_keys = ["store_id", "country"]

rows_before_dedupe = stores_silver_df.count()
print(f"Before deduplicate count : {rows_before_dedupe}")

# Keep one row per (store_id, country) combination.
stores_silver_df = stores_silver_df.dropDuplicates(dedupe_keys)

rows_after_dedupe = stores_silver_df.count()
print(f"After deduplicate count : {rows_after_dedupe}")

Before deduplicate count : 35
After deduplicate count : 35


### Filtering out null store_id, country, and store_name rows

In [0]:
# Keep only rows where store_id, country, and store_name are all populated.
required_fields_present = (
    col("store_id").isNotNull()
    & col("country").isNotNull()
    & col("store_name").isNotNull()
)
stores_silver_df = stores_silver_df.filter(required_fields_present)

### Validating the Primary Key

In [0]:
# List every store_id that occurs more than once; an empty result means
# store_id is unique in the DataFrame.
duplicate_store_ids = (
    stores_silver_df
    .groupBy("store_id")
    .count()
    .filter(col("count") > 1)
)
display(duplicate_store_ids)

store_id,count


### Null Check

In [0]:
# One aggregate per column: sum of a 0/1 "is null" flag, aliased to the column name.
# Note: `sum` here is the Spark SQL function brought in by the wildcard import
# of pyspark.sql.functions, not Python's builtin.
null_flag_totals = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in stores_silver_df.columns
]

null_counts = stores_silver_df.select(*null_flag_totals)
display(null_counts)

store_id,country,city,store_name,number_of_employees,zip_code,latitude,longitude,ingestion_ts,_source_file
0,0,0,0,0,0,0,0,0,0


### Schema Enforcement Check

In [0]:
# Compare the DataFrame about to be written against SILVER_STORES_SCHEMA.
# Three things are checked:
#   1. columns present in the DataFrame but not in the schema (reported only),
#   2. columns required by the schema but absent from the DataFrame (fails),
#   3. columns whose Spark type name differs from the schema's (fails).
expected_cols = set(SILVER_STORES_SCHEMA.keys())
incoming_cols = set(stores_silver_df.columns)

# Metadata columns that are carried along but are not listed in SILVER_STORES_SCHEMA.
passthrough_cols = {"ingestion_ts", "_source_file"}

unknown_cols = incoming_cols - expected_cols - passthrough_cols

# Message text is kept unchanged; the columns inspected are those of
# stores_silver_df, which was derived from the Bronze DataFrame.
print("Unknown columns in Bronze:", unknown_cols)

missing_cols = expected_cols - incoming_cols

# DataType.typeName() yields names such as "integer", "string", "double",
# which is the vocabulary SILVER_STORES_SCHEMA uses for its values.
actual_types = {
    field.name: field.dataType.typeName()
    for field in stores_silver_df.schema.fields
}
type_mismatches = {
    name: {"expected": expected_type, "actual": actual_types[name]}
    for name, expected_type in SILVER_STORES_SCHEMA.items()
    if name in actual_types and actual_types[name] != expected_type
}

# Fail before the write so a malformed DataFrame never reaches the Silver table.
if missing_cols or type_mismatches:
    raise ValueError(
        "Silver stores schema check failed: "
        f"missing columns={sorted(missing_cols)}, "
        f"type mismatches={type_mismatches}"
    )


Unknown columns in Bronze: set()


### Creating or Updating Silver Stores Table

In [0]:
# First run: create the Silver table from the DataFrame.
# Later runs: upsert into the existing Delta table.
silver_table_exists = spark.catalog.tableExists(SILVER_STORES_PATH)

if silver_table_exists:
    stores_silver_tbl = DeltaTable.forName(spark, SILVER_STORES_PATH)

    # Rows are matched on the (store_id, country) pair.
    merge_condition = "tgt.store_id = src.store_id AND tgt.country = src.country"

    (
        stores_silver_tbl.alias("tgt")
        .merge(stores_silver_df.alias("src"), merge_condition)
        .whenMatchedUpdateAll()      # matched rows take all source values
        .whenNotMatchedInsertAll()   # unmatched source rows are inserted
        .execute()
    )
else:
    silver_writer = stores_silver_df.write.format("delta").mode("overwrite")
    silver_writer.saveAsTable(SILVER_STORES_PATH)

In [0]:
# Read the Silver table back and preview five rows to confirm the write.
display(spark.read.table(SILVER_STORES_PATH).limit(5))

store_id,country,city,store_name,number_of_employees,zip_code,latitude,longitude,ingestion_ts,_source_file
1,UNITED STATES,New York,Store New York,10,10001,40.7128,-74.006,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
2,UNITED STATES,Los Angeles,Store Los Angeles,8,90001,34.0522,-118.2437,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
3,UNITED STATES,Chicago,Store Chicago,9,60601,41.8781,-87.6298,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
4,UNITED STATES,Houston,Store Houston,10,77001,29.7604,-95.3698,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
5,UNITED STATES,Phoenix,Store Phoenix,9,85001,33.4484,-112.074,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv


In [0]:
# Total number of rows currently stored in the Silver stores table.
silver_stores_row_count = spark.read.table(SILVER_STORES_PATH).count()
silver_stores_row_count

35