### Importing the Needed Modules

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import sys
# Extend the module search path so the `src.*` imports that follow can be resolved
# (presumably this directory is the repository root that contains `src` — confirm).
# NOTE(review): this is an absolute, user-specific workspace path; the notebook
# will not find `src` when run from a different user's folder or checkout.
sys.path.append('/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform')

from src.paths import SILVER_CUSTOMERS_PATH, DIM_CUSTOMERS_PATH
from src.schema_definitions import DIM_CUSTOMERS_SCHEMA
from src.utils import add_gold_metadata
from delta.tables import DeltaTable

### Querying Silver Customers Table

In [0]:
# Load the silver customers table; it is the input for every later step.
cust_silver_df = spark.read.table(SILVER_CUSTOMERS_PATH)

# Show only a small sample so the notebook output stays short.
cust_silver_sample_df = cust_silver_df.limit(5)
cust_silver_sample_df.display()

customer_id,name,email,telephone,city,country,gender,date_of_birth,job_title,ingestion_ts,_source_file
94,Melissa Porter,melissa.porter@fake_hotmail.com,001-915-876-4475x65903,New York,UNITED STATES,Female,1977-08-23,Geophysicist/field Seismologist,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
586,Daniel Gallegos,daniel.gallegos@fake_gmail.com,(609)511-6278x1772,New York,UNITED STATES,Male,2003-04-05,UnKnown,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
1230,Catherine Mcmillan,catherine.mcmillan@fake_yahoo.com,372-603-6029,New York,UNITED STATES,Female,1992-09-11,Drilling Engineer,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
2060,Kevin White,kevin.white@fake_yahoo.com,+1-267-803-3725x942,New York,UNITED STATES,Male,2004-01-15,UnKnown,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv
3668,David Brady,david.brady@fake_hotmail.com,(991)949-5547,New York,UNITED STATES,Male,2003-01-27,UnKnown,2026-01-14T05:29:25.815Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv


### Dim_customers Schema Reference

In [0]:
# Echo the shared dim_customers schema mapping (column name -> Spark SQL type)
# imported from src.schema_definitions, as a reference for the table built below.
DIM_CUSTOMERS_SCHEMA

{'customer_sk': 'long',
 'customer_id': 'integer',
 'name': 'string',
 'email': 'string',
 'telephone': 'string',
 'city': 'string',
 'country': 'string',
 'gender': 'string',
 'date_of_birth': 'date',
 'job_title': 'string',
 '_created_at': 'timestamp',
 '_updated_at': 'timestamp'}

### Selecting the Needed Columns for dim_customers

In [0]:
# Business attributes carried into dim_customers; the silver-only lineage
# columns (ingestion_ts, _source_file) are intentionally left out.
dim_customer_attribute_cols = [
    "customer_id",
    "name",
    "email",
    "telephone",
    "city",
    "country",
    "gender",
    "date_of_birth",
    "job_title",
]
cust_silver_df = cust_silver_df.select(*dim_customer_attribute_cols)

### Creating Metadata Columns: `_created_at` and `_updated_at`

In [0]:
# Build the frame that is merged into dim_customers.
# NOTE(review): add_gold_metadata comes from src.utils and is not visible here;
# presumably it appends the _created_at / _updated_at columns that the merge
# reads from this frame — confirm against the helper.
dim_cust_df = add_gold_metadata(cust_silver_df)

### Creating Dim_customers Table with surrogate key

In [0]:
# Build the column definitions from DIM_CUSTOMERS_SCHEMA instead of repeating
# them by hand, so the table DDL cannot drift from the shared schema reference.
# Only the surrogate key needs an extra clause: Delta generates its values.
surrogate_key_col = "customer_sk"
identity_clause = "generated always as identity"

# Fail loudly if the shared schema ever drops the surrogate key; otherwise the
# table would silently be created without an identity column.
assert surrogate_key_col in DIM_CUSTOMERS_SCHEMA, (
    f"{surrogate_key_col} missing from DIM_CUSTOMERS_SCHEMA"
)

column_definitions = []
for col_name, col_type in DIM_CUSTOMERS_SCHEMA.items():
    definition = f"{col_name} {col_type}"
    if col_name == surrogate_key_col:
        definition = f"{definition} {identity_clause}"
    column_definitions.append(definition)

column_ddl = ",\n            ".join(column_definitions)

# "if not exists" keeps this cell safe to re-run: an existing table is left as is.
spark.sql(f"""
        create table if not exists {DIM_CUSTOMERS_PATH}(
            {column_ddl}
        ) using delta
    """)

DataFrame[]

### Updating the Dim_customers Table

In [0]:
# Upsert the prepared customer rows into dim_customers, matching on the
# business key customer_id. customer_sk is never written here: it is an
# identity column and Delta assigns it on insert.
dim_cust_tbl = DeltaTable.forName(spark, DIM_CUSTOMERS_PATH)

# Descriptive attributes copied from source to target on update and on insert.
tracked_cols = [
    "name",
    "email",
    "telephone",
    "city",
    "country",
    "gender",
    "date_of_birth",
    "job_title",
]

# Only touch a matched row when at least one attribute really differs.
# Without this guard every re-run rewrites every matched row and bumps
# _updated_at even though nothing changed. `<=>` is null-safe equality, so
# NULL vs value counts as a change while NULL vs NULL does not.
row_changed = " OR ".join(
    f"NOT (tgt.{col_name} <=> src.{col_name})" for col_name in tracked_cols
)

attribute_assignments = {
    f"tgt.{col_name}": f"src.{col_name}" for col_name in tracked_cols
}

# Updates refresh the attributes and _updated_at; _created_at is preserved.
update_set = {**attribute_assignments, "tgt._updated_at": "src._updated_at"}

# Inserts additionally carry the business key and the creation timestamp.
insert_values = {
    "tgt.customer_id": "src.customer_id",
    **attribute_assignments,
    "tgt._created_at": "src._created_at",
    "tgt._updated_at": "src._updated_at",
}

# NOTE(review): MERGE fails if several source rows match the same target row;
# this assumes the silver table holds one row per customer_id — confirm.
dim_cust_tbl.alias("tgt").merge(
    dim_cust_df.alias("src"),
    "tgt.customer_id = src.customer_id"
).whenMatchedUpdate(
    condition=row_changed,
    set=update_set
).whenNotMatchedInsert(
    values=insert_values
).execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Preview a few rows of the dimension table as it is currently stored.
dim_customers_sample_df = spark.read.table(DIM_CUSTOMERS_PATH).limit(5)
dim_customers_sample_df.display()

customer_sk,customer_id,name,email,telephone,city,country,gender,date_of_birth,job_title,_created_at,_updated_at
1,94,Melissa Porter,melissa.porter@fake_hotmail.com,001-915-876-4475x65903,New York,UNITED STATES,Female,1977-08-23,Geophysicist/field Seismologist,2026-01-17T11:42:27.164Z,2026-01-17T11:42:27.164Z
2,586,Daniel Gallegos,daniel.gallegos@fake_gmail.com,(609)511-6278x1772,New York,UNITED STATES,Male,2003-04-05,UnKnown,2026-01-17T11:42:27.164Z,2026-01-17T11:42:27.164Z
3,1230,Catherine Mcmillan,catherine.mcmillan@fake_yahoo.com,372-603-6029,New York,UNITED STATES,Female,1992-09-11,Drilling Engineer,2026-01-17T11:42:27.164Z,2026-01-17T11:42:27.164Z
4,2060,Kevin White,kevin.white@fake_yahoo.com,+1-267-803-3725x942,New York,UNITED STATES,Male,2004-01-15,UnKnown,2026-01-17T11:42:27.164Z,2026-01-17T11:42:27.164Z
5,3668,David Brady,david.brady@fake_hotmail.com,(991)949-5547,New York,UNITED STATES,Male,2003-01-27,UnKnown,2026-01-17T11:42:27.164Z,2026-01-17T11:42:27.164Z


In [0]:
# Total number of rows currently stored in the dimension table.
dim_customers_row_count = spark.read.table(DIM_CUSTOMERS_PATH).count()
dim_customers_row_count

1643306