### Importing the Needed Modules

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import sys
# Put the project folder on sys.path; presumably this is what lets the `src.*` imports below resolve.
# NOTE(review): this is a hard-coded, user-specific workspace path (it embeds an e-mail address);
# consider deriving it from configuration so the notebook runs for other users.
sys.path.append('/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform')

from src.paths import SILVER_STORES_PATH, DIM_STORES_PATH
from src.schema_definitions import DIM_STORES_SCHEMA
from src.utils import add_gold_metadata
from delta.tables import DeltaTable

### Querying Silver Stores Table

In [0]:
# Load the silver-layer stores table; later cells derive the dimension from this frame.
store_silver_df = spark.read.table(SILVER_STORES_PATH)

# Show a small sample only, never the whole table.
silver_stores_preview = store_silver_df.limit(5)
silver_stores_preview.display()

store_id,country,city,store_name,number_of_employees,zip_code,latitude,longitude,ingestion_ts,_source_file
1,UNITED STATES,New York,Store New York,10,10001,40.7128,-74.006,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
2,UNITED STATES,Los Angeles,Store Los Angeles,8,90001,34.0522,-118.2437,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
3,UNITED STATES,Chicago,Store Chicago,9,60601,41.8781,-87.6298,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
4,UNITED STATES,Houston,Store Houston,10,77001,29.7604,-95.3698,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv
5,UNITED STATES,Phoenix,Store Phoenix,9,85001,33.4484,-112.074,2026-01-14T05:51:11.556Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv


### Dim_stores Schema Reference

In [0]:
# Display the dim_stores column -> type mapping imported from src.schema_definitions,
# as a reference for the column selection and DDL cells below.
DIM_STORES_SCHEMA

{'store_sk': 'long',
 'store_id': 'integer',
 'store_name': 'string',
 'city': 'string',
 'country': 'string',
 'zip_code': 'string',
 'latitude': 'double',
 'longitude': 'double',
 'number_of_employees': 'integer',
 '_created_at': 'timestamp',
 '_updated_at': 'timestamp'}

### Selecting the Needed Columns for dim_stores

In [0]:
# Business columns carried into dim_stores; the silver ingestion metadata
# (ingestion_ts, _source_file) is intentionally left out.
dim_store_source_columns = [
    "store_id",
    "store_name",
    "city",
    "country",
    "zip_code",
    "latitude",
    "longitude",
    "number_of_employees",
]
store_silver_df = store_silver_df.select(*dim_store_source_columns)

### Creating Metadata Columns: `_created_at` and `_updated_at`

In [0]:
# Build the frame that is merged into dim_stores by passing the selected columns through the project helper.
# NOTE(review): presumably add_gold_metadata appends `_created_at` / `_updated_at` timestamps
# (the merge below reads both from this frame) — confirm against src.utils.
dim_store_df = add_gold_metadata(store_silver_df)

### Creating Dim_stores Table with surrogate key

In [0]:
# DDL for the dimension table. `store_sk` is an identity column, so its values
# are generated by the table and never supplied by the merge below.
# `if not exists` keeps this cell safe to re-run.
create_dim_stores_ddl = f"""
        create table if not exists {DIM_STORES_PATH}(
            store_sk long generated always as identity,
            store_id integer,
            store_name string,
            city string,
            country string,
            zip_code string,
            latitude double,
            longitude double,
            number_of_employees integer,
            _created_at timestamp,
            _updated_at timestamp
        ) using delta
    """

spark.sql(create_dim_stores_ddl)

DataFrame[]

### Updating the Dim_stores Table

In [0]:
# Upsert the prepared store rows into dim_stores, keyed on the natural key `store_id`.
#   * matched + changed -> overwrite the business attributes and bump `_updated_at`
#   * matched + unchanged -> left untouched, so re-running the notebook does not
#     rewrite every row or move `_updated_at` forward without a real change
#   * not matched        -> insert a new row (`store_sk` is generated by the identity column)
dim_store_tbl = DeltaTable.forName(spark, DIM_STORES_PATH)

# Business attributes tracked on the dimension (everything except keys and audit columns).
tracked_columns = [
    "store_name",
    "city",
    "country",
    "zip_code",
    "latitude",
    "longitude",
    "number_of_employees",
]

# `<=>` is Spark SQL's null-safe equality, so NULL -> value (and value -> NULL)
# transitions are detected as changes instead of evaluating to NULL.
row_changed = " OR ".join(
    f"NOT (tgt.{col_name} <=> src.{col_name})" for col_name in tracked_columns
)

# On update, `_created_at` is deliberately preserved; only `_updated_at` moves.
update_assignments = {f"tgt.{col_name}": f"src.{col_name}" for col_name in tracked_columns}
update_assignments["tgt._updated_at"] = "src._updated_at"

insert_assignments = {
    f"tgt.{col_name}": f"src.{col_name}"
    for col_name in ["store_id", *tracked_columns, "_created_at", "_updated_at"]
}

(
    dim_store_tbl.alias("tgt")
    .merge(dim_store_df.alias("src"), "tgt.store_id = src.store_id")
    .whenMatchedUpdate(condition=row_changed, set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Preview a few rows of the dimension after the merge, including the generated `store_sk`.
dim_stores_preview = spark.read.table(DIM_STORES_PATH).limit(5)
dim_stores_preview.display()

store_sk,store_id,store_name,city,country,zip_code,latitude,longitude,number_of_employees,_created_at,_updated_at
1,1,Store New York,New York,UNITED STATES,10001,40.7128,-74.006,10,2026-01-17T12:57:20.054Z,2026-01-17T12:57:20.054Z
2,2,Store Los Angeles,Los Angeles,UNITED STATES,90001,34.0522,-118.2437,8,2026-01-17T12:57:20.054Z,2026-01-17T12:57:20.054Z
3,3,Store Chicago,Chicago,UNITED STATES,60601,41.8781,-87.6298,9,2026-01-17T12:57:20.054Z,2026-01-17T12:57:20.054Z
4,4,Store Houston,Houston,UNITED STATES,77001,29.7604,-95.3698,10,2026-01-17T12:57:20.054Z,2026-01-17T12:57:20.054Z
5,5,Store Phoenix,Phoenix,UNITED STATES,85001,33.4484,-112.074,9,2026-01-17T12:57:20.054Z,2026-01-17T12:57:20.054Z


In [0]:
# Row count of dim_stores after the merge, shown as a quick sanity check.
spark.read.table(DIM_STORES_PATH).count()

35