### Importing the Needed Modules

In [0]:
import sys
import os

# Location of the repository checkout inside the workspace (absolute path).
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Put the repo root at the front of the module search path, unless it is
# already registered there (e.g. this cell was run before).
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

# Diagnostic output only; can be dropped once the setup is trusted.
print("Current working directory:", os.getcwd())
print("Repo root added to path:", PROJECT_ROOT in sys.path)
from src.paths import *

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import sys

# Same repo root that the first cell registers on sys.path. The append is
# guarded so that re-running this cell, or running it after the first cell,
# does not accumulate duplicate sys.path entries.
_REPO_ROOT = '/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform'
if _REPO_ROOT not in sys.path:
    sys.path.append(_REPO_ROOT)

from src.paths import SILVER_CUSTOMERS_PATH, DIM_CUSTOMERS_PATH
from src.schema_definitions import DIM_CUSTOMERS_SCHEMA
from src.utils import add_gold_metadata
from delta.tables import DeltaTable

### Querying Silver Customers Table

In [0]:
# Load the silver customers table, then preview its first five rows.
cust_silver_df = spark.table(SILVER_CUSTOMERS_PATH)
cust_silver_df.limit(5).display()

### Dim_customers Schema Reference

In [0]:
# Show the schema object imported from src.schema_definitions, for reference.
# NOTE(review): the CREATE TABLE statement further down spells its columns out
# by hand rather than reading this object, so the two must be kept in sync.
DIM_CUSTOMERS_SCHEMA

### Selecting the Needed columns for dim_customers

In [0]:
# Narrow the silver frame to the columns that dim_customers carries.
cust_silver_df = cust_silver_df.select(
    "customer_id",
    "name",
    "email",
    "telephone",
    "city",
    "country",
    "gender",
    "date_of_birth",
    "job_title",
)

### Creating Metadata Columns: `_created_at` and `_updated_at`

In [0]:
# add_gold_metadata comes from src.utils; its body is not visible here.
# Presumably it returns the input DataFrame with _created_at and _updated_at
# added (the merge further down reads both from this frame) — TODO confirm.
dim_cust_df = add_gold_metadata(cust_silver_df)

### Creating Dim_customers Table with surrogate key

In [0]:
# Create the dimension table on first run; `if not exists` leaves an existing
# table untouched. DIM_CUSTOMERS_PATH (from src.paths) is interpolated as the
# table identifier.
# customer_sk is declared as an identity column, and the merge further down
# never supplies a value for it.
# NOTE(review): presumably these columns mirror DIM_CUSTOMERS_SCHEMA — confirm.
spark.sql(f"""
        create table if not exists {DIM_CUSTOMERS_PATH}(
            customer_sk long generated always as identity,
            customer_id integer,
            name string,
            email string,
            telephone string,
            city string,
            country string,
            gender string,
            date_of_birth date,
            job_title string,
            _created_at timestamp,
            _updated_at timestamp
        ) using delta
    """)

### Updating the Dim_customers Table

In [0]:
dim_cust_tbl = DeltaTable.forName(spark, DIM_CUSTOMERS_PATH)

# Descriptive attributes copied one-to-one from source ("src") to target ("tgt").
attribute_cols = [
    "name",
    "email",
    "telephone",
    "city",
    "country",
    "gender",
    "date_of_birth",
    "job_title",
]

# On a customer_id match: overwrite every attribute plus _updated_at.
# customer_id, _created_at and customer_sk are not part of the update set.
matched_updates = {
    f"tgt.{field}": f"src.{field}"
    for field in [*attribute_cols, "_updated_at"]
}

# On no match: insert the business key, the attributes and both timestamps.
# customer_sk is not supplied here.
new_row_values = {
    f"tgt.{field}": f"src.{field}"
    for field in ["customer_id", *attribute_cols, "_created_at", "_updated_at"]
}

# Upsert the prepared frame into the dimension table, keyed on customer_id.
(
    dim_cust_tbl.alias("tgt")
    .merge(dim_cust_df.alias("src"), "tgt.customer_id = src.customer_id")
    .whenMatchedUpdate(set=matched_updates)
    .whenNotMatchedInsert(values=new_row_values)
    .execute()
)

In [0]:
# Preview five rows of the dimension table after the merge.
# Note: the table holds name, email and telephone columns, so this output
# shows customer contact details — review before sharing the notebook.
spark.table(DIM_CUSTOMERS_PATH).limit(5).display()

In [0]:
# Row count of the dimension table (rendered as this cell's output).
spark.table(DIM_CUSTOMERS_PATH).count()