### Importing the Needed Modules

In [0]:
import sys
import os

# Workspace location of the repository checkout; `src.*` modules are imported from here.
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Give the repo root top priority on the module search path. The membership
# test keeps repeated runs of this cell from stacking duplicate entries.
repo_root_missing = PROJECT_ROOT not in sys.path
if repo_root_missing:
    sys.path.insert(0, PROJECT_ROOT)

# Diagnostics only: where the notebook is running and whether the path took effect.
working_directory = os.getcwd()
repo_root_on_path = PROJECT_ROOT in sys.path
print("Current working directory:", working_directory)
print("Repo root added to path:", repo_root_on_path)
from src.paths import *

In [0]:
# Restart the notebook's Python interpreter via the Databricks utilities.
# NOTE(review): a restart presumably discards the sys.path entry and the names
# imported by the preceding cell -- the next cell re-adds the repo path and
# re-imports what it needs -- so confirm whether that earlier setup cell is still required.
dbutils.library.restartPython()


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import sys

# Repository checkout that provides the `src` package imported below.
# It is (re-)registered here because this cell runs after the interpreter
# restart in the previous cell.
PROJECT_ROOT = '/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform'

# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries (same guard as the first setup cell).
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.paths import SILVER_PRODUCTS_PATH, DIM_PRODUCTS_PATH
from src.schema_definitions import DIM_PRODUCTS_SCHEMA
from src.utils import add_gold_metadata
from delta.tables import DeltaTable

### Querying Silver Products Table

In [0]:
# Load the silver-layer products table that feeds the dimension.
prod_silver_df = spark.table(SILVER_PRODUCTS_PATH)

# Render a five-row sample so the incoming columns can be eyeballed.
silver_preview_df = prod_silver_df.limit(5)
silver_preview_df.display()

### Dim_products Schema Reference

In [0]:
# Show the dim_products schema object imported from src.schema_definitions;
# as the cell's last expression it is rendered as the cell output for reference.
DIM_PRODUCTS_SCHEMA

### Selecting the Needed Columns for dim_products

In [0]:
# Columns carried from the silver table into dim_products: the product_id key,
# the descriptive attributes, and the per-language descriptions.
dim_product_columns = [
    "product_id",
    "category",
    "sub_category",
    "color",
    "sizes",
    "production_cost",
    "description_PT",
    "description_DE",
    "description_FR",
    "description_ES",
    "description_EN",
    "description_ZH",
]

# Project down to exactly those columns; any other silver column is dropped.
prod_silver_df = prod_silver_df.select(*dim_product_columns)

### Creating Metadata Columns: `_created_at` and `_updated_at`

In [0]:
# Build the frame that is merged into dim_products.
# add_gold_metadata comes from src.utils and is not visible here; the merge
# further down reads src._created_at and src._updated_at, so it presumably
# appends those two timestamp columns -- TODO confirm against src/utils.
dim_prod_df = add_gold_metadata(prod_silver_df)

### Creating Dim_products Table with surrogate key

In [0]:
# DDL for the product dimension. `if not exists` makes this a no-op once the
# table is there, so the cell can be re-run safely. product_sk is the surrogate
# key: an identity column whose values are generated by Delta rather than
# supplied by the loader.
create_dim_products_sql = f"""
        create table if not exists {DIM_PRODUCTS_PATH}(
            product_sk long generated always as identity,
            product_id integer,
            category string,
            sub_category string,
            color string,
            sizes string,
            production_cost double,
            description_PT string,
            description_DE string,
            description_FR string,
            description_ES string,
            description_EN string,
            description_ZH string,
            _created_at timestamp,
            _updated_at timestamp
        ) using delta
    """

spark.sql(create_dim_products_sql)

### Updating the Dim_products Table

In [0]:
# Upsert the staged product rows into dim_products, matching on the natural
# key product_id.
dim_prod_tbl = DeltaTable.forName(spark, DIM_PRODUCTS_PATH)

# Columns copied one-to-one from the staged frame ("src") to the table ("tgt").
# product_sk is deliberately absent: neither clause below assigns it.
attribute_columns = [
    "product_id",
    "category",
    "sub_category",
    "color",
    "sizes",
    "production_cost",
    "description_PT",
    "description_DE",
    "description_FR",
    "description_ES",
    "description_EN",
    "description_ZH",
]

# Matched rows: overwrite every attribute and take the source _updated_at.
# _created_at is not assigned here, so the stored value is left as it is.
update_assignments = {
    f"tgt.{col_name}": f"src.{col_name}"
    for col_name in attribute_columns + ["_updated_at"]
}

# New rows: same attributes plus both audit timestamps.
insert_assignments = {
    f"tgt.{col_name}": f"src.{col_name}"
    for col_name in attribute_columns + ["_created_at", "_updated_at"]
}

# NOTE(review): the update clause has no condition, so every matched row is
# rewritten on each run with whatever _updated_at add_gold_metadata stamped,
# even when no attribute changed -- confirm that is the intended meaning of
# _updated_at.
products_upsert = (
    dim_prod_tbl.alias("tgt")
    .merge(dim_prod_df.alias("src"), "tgt.product_id = src.product_id")
    .whenMatchedUpdate(set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
)
products_upsert.execute()

In [0]:
# Spot-check the merge result: render five rows of the dimension table.
dim_products_preview_df = spark.table(DIM_PRODUCTS_PATH).limit(5)
dim_products_preview_df.display()

In [0]:
# Total number of rows now in dim_products (shown as the cell output).
dim_products_df = spark.table(DIM_PRODUCTS_PATH)
dim_products_df.count()