### Importing the Needed Modules

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Make the repository root importable so the `src.*` imports that follow resolve.
# NOTE(review): this is an absolute, user-specific workspace path; the notebook
# will not find `src` when run from another user's workspace unless it is updated.
import sys
sys.path.append('/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform')

from src.paths import SILVER_EMPLOYEES_PATH, DIM_EMPLOYEES_PATH
from src.schema_definitions import DIM_EMPLOYEES_SCHEMA
from src.utils import add_gold_metadata
from delta.tables import DeltaTable

### Querying Silver Employees Table

In [0]:
# Load the silver-layer employees table, then preview the first few rows.
emp_silver_df = spark.table(SILVER_EMPLOYEES_PATH)
display(emp_silver_df.limit(5))

employee_id,store_id,name,position,ingestion_ts,_source_file
159,13,Rosina Albers,Sales Associate,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
197,17,Richard Parks,Cashier,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
362,32,Soraia Batista-maia,Assistant Manager,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
397,35,Teresa Freitas,Cashier,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv
210,18,Keith Small,Stock Clerk,2026-01-14T05:32:03.286Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv


### Dim_employees Schema Reference

In [0]:
# Show the target column-name -> type mapping imported from src.schema_definitions,
# as a reference for the columns built in the cells that follow.
DIM_EMPLOYEES_SCHEMA

{'employee_sk': 'long',
 'employee_id': 'integer',
 'store_id': 'integer',
 'name': 'string',
 'position': 'string',
 '_created_at': 'timestamp',
 '_updated_at': 'timestamp'}

### Selecting the Columns Needed for `dim_employees`

In [0]:
# Keep only the business attributes stored in the dimension; the remaining
# silver columns are left behind.
dim_employee_columns = ["employee_id", "store_id", "name", "position"]
emp_silver_df = emp_silver_df.select(*dim_employee_columns)

### Creating metadata columns: `_created_at` and `_updated_at`

In [0]:
# add_gold_metadata comes from src.utils and is not shown here. Presumably it
# appends the _created_at / _updated_at columns, since the merge into the
# dimension table reads both from this DataFrame — TODO confirm against src.utils.
dim_emp_df = add_gold_metadata(emp_silver_df)

### Creating Dim_employees Table with surrogate key

In [0]:
# DDL for the gold employee dimension. IF NOT EXISTS makes this a no-op when the
# table is already present; employee_sk is declared as a generated identity
# column, so it is not supplied by the writer.
create_dim_employees_sql = f"""
    CREATE TABLE IF NOT EXISTS {DIM_EMPLOYEES_PATH} (
        employee_sk LONG GENERATED ALWAYS AS IDENTITY,
        employee_id INT,
        store_id INT,
        name STRING,
        position STRING,
        _created_at TIMESTAMP,
        _updated_at TIMESTAMP
    )
    USING DELTA
"""
spark.sql(create_dim_employees_sql)

DataFrame[]

### Updating the Dim_employees Table

In [0]:
# Upsert the prepared employee rows into the gold dimension, matching on the
# business key employee_id. employee_sk is never written here: the table
# definition declares it as a generated identity column.
dim_employee_tbl = DeltaTable.forName(spark, DIM_EMPLOYEES_PATH)

# Only touch a matched row when one of its tracked attributes actually differs.
# Without this guard every re-run rewrites every matched row and bumps
# _updated_at even though nothing about the employee changed.
# `<=>` is null-safe equality, so a change to or from NULL counts as a change.
employee_changed = (
    "NOT (tgt.store_id <=> src.store_id "
    "AND tgt.name <=> src.name "
    "AND tgt.position <=> src.position)"
)

dim_employee_tbl.alias("tgt").merge(
    dim_emp_df.alias("src"),
    "tgt.employee_id = src.employee_id"
).whenMatchedUpdate(
    condition=employee_changed,
    # _created_at is intentionally not updated so it keeps the first-load time.
    set={
        "tgt.store_id": "src.store_id",
        "tgt.name": "src.name",
        "tgt.position": "src.position",
        "tgt._updated_at": "src._updated_at"
    }
).whenNotMatchedInsert(values={
    "tgt.employee_id": "src.employee_id",
    "tgt.store_id": "src.store_id",
    "tgt.name": "src.name",
    "tgt.position": "src.position",
    "tgt._created_at": "src._created_at",
    "tgt._updated_at": "src._updated_at"
}).execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Spot-check a few rows of the dimension table after the merge.
display(spark.table(DIM_EMPLOYEES_PATH).limit(5))

employee_sk,employee_id,store_id,name,position,_created_at,_updated_at
1,159,13,Rosina Albers,Sales Associate,2026-01-17T13:11:32.398Z,2026-01-17T13:11:32.398Z
2,197,17,Richard Parks,Cashier,2026-01-17T13:11:32.398Z,2026-01-17T13:11:32.398Z
3,362,32,Soraia Batista-maia,Assistant Manager,2026-01-17T13:11:32.398Z,2026-01-17T13:11:32.398Z
4,397,35,Teresa Freitas,Cashier,2026-01-17T13:11:32.398Z,2026-01-17T13:11:32.398Z
5,210,18,Keith Small,Stock Clerk,2026-01-17T13:11:32.398Z,2026-01-17T13:11:32.398Z


In [0]:
# Total number of rows currently in the dimension table.
spark.table(DIM_EMPLOYEES_PATH).count()

404