### Importing the Needed Modules

In [0]:
import sys
import os

# Absolute path to the repo root.
# NOTE(review): this is a user-specific workspace path, so the notebook only
# resolves `src` when the repo is checked out at exactly this location.
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Put the repo root at the front of sys.path (only once) so `src` is importable.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Debug checks (safe to remove later)
print("Current working directory:", os.getcwd())
print("Repo root added to path:", PROJECT_ROOT in sys.path)

# Import only the table names this notebook uses instead of `import *`, so it is
# obvious where each name comes from and nothing unexpected enters the namespace.
from src.paths import BRONZE_PRODUCTS_PATH, SILVER_PRODUCTS_PATH

In [0]:
import sys

# Alternative checkout location (under /Workspace/Repos) probed while debugging imports.
REPO_ROOT = "/Workspace/Repos/thiruvengadamk16/Retail-And-Ecommerce-Analytics-Platform"

# Prepend the path only when it is not already registered.
if sys.path.count(REPO_ROOT) == 0:
    sys.path.insert(0, REPO_ROOT)

# Show the entry that import resolution will try first.
print(sys.path[0])


In [0]:
import importlib.util

# Diagnostic: look up the import spec for the top-level `src` package and print it.
# find_spec returns None when no such module can be located on sys.path.
src_spec = importlib.util.find_spec("src")
print(src_spec)


In [0]:
# Calls restartPython() on `dbutils.library`. `dbutils` is not defined or imported
# anywhere in the visible notebook, so it has to be provided by the runtime.
# NOTE(review): presumably this resets the Python process, which would discard the
# imports and sys.path edits made by the cells above — confirm against the platform docs.
dbutils.library.restartPython()


In [0]:
import sys
import os

# Absolute path to the repo root.
# NOTE(review): this is a user-specific workspace path, so the notebook only
# resolves `src` when the repo is checked out at exactly this location.
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Put the repo root at the front of sys.path (only once) so `src` is importable.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Debug checks (safe to remove later)
print("Current working directory:", os.getcwd())
print("Repo root added to path:", PROJECT_ROOT in sys.path)

# Import only the table names this notebook uses instead of `import *`, so it is
# obvious where each name comes from and nothing unexpected enters the namespace.
from src.paths import BRONZE_PRODUCTS_PATH, SILVER_PRODUCTS_PATH

In [0]:
# Step-by-step import diagnostic: each import is followed by a print, so the last
# message shown identifies how far the import chain got before any failure.

# Step 1: the top-level package.
import src
print("src imported")

# Step 2: the `paths` submodule.
import src.paths
print("paths imported")

# Step 3: the two table-name constants used by the rest of the notebook.
from src.paths import BRONZE_PRODUCTS_PATH, SILVER_PRODUCTS_PATH
print("variables imported")


In [0]:
from src.paths import BRONZE_PRODUCTS_PATH, SILVER_PRODUCTS_PATH


In [0]:
import os
import sys

# Repo root that contains the `src` package.
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Guarded so that re-running this cell does not keep appending duplicate entries
# to sys.path (an unconditional append adds one more copy on every execution).
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# The star imports are kept on purpose: later cells call col/trim/initcap/upper/
# count/sum and the *Type classes unqualified. Note that this makes `sum` and
# `count` refer to the Spark column functions, which the null-check cells rely on.
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable

# Project-local table names and the silver schema definition.
from src.paths import BRONZE_PRODUCTS_PATH, SILVER_PRODUCTS_PATH
from src.schema_definitions import SILVER_PRODUCTS_SCHEMA

### Querying the Bronze Products Table

In [0]:
# Load the bronze products table named by BRONZE_PRODUCTS_PATH.
bronze_reader = spark.read
prod_bronze_df = bronze_reader.table(BRONZE_PRODUCTS_PATH)

# Render only a five-row sample rather than the whole table.
bronze_preview = prod_bronze_df.limit(5)
bronze_preview.display()

### Silver Products Schema Reference

In [0]:
# Bare expression as the last line of the cell: the notebook renders the imported
# schema definition as the cell output, for reference while reading the casts below.
SILVER_PRODUCTS_SCHEMA

### Schema Enforcement and Column Name Standardizing

In [0]:
# (bronze column name, target Spark type, silver column name), in output order.
typed_column_map = [
    ("Product ID", IntegerType(), "product_id"),
    ("Category", StringType(), "category"),
    ("Sub Category", StringType(), "sub_category"),
    ("Description PT", StringType(), "description_PT"),
    ("Description DE", StringType(), "description_DE"),
    ("Description FR", StringType(), "description_FR"),
    ("Description ES", StringType(), "description_ES"),
    ("Description EN", StringType(), "description_EN"),
    ("Description ZH", StringType(), "description_ZH"),
    ("Color", StringType(), "color"),
    ("Sizes", StringType(), "sizes"),
    ("Production Cost", DoubleType(), "production_cost"),
]

# Cast and rename every business column, then carry the two ingestion metadata
# columns through unchanged.
prod_silver_df = prod_bronze_df.select(
    *[
        col(bronze_name).cast(target_type).alias(silver_name)
        for bronze_name, target_type, silver_name in typed_column_map
    ],
    col("ingestion_ts"),
    col("_source_file"),
)


### Trimming Whitespace and Formatting Values

In [0]:
# Column groups, listed in the order the transformations are applied.
title_case_columns = ["category", "sub_category"]
trim_only_columns = [
    "description_PT",
    "description_DE",
    "description_FR",
    "description_ES",
    "description_EN",
    "description_ZH",
]
upper_case_columns = ["color", "sizes"]

cleaned_df = prod_silver_df

# Trim, then capitalise the first letter of each word.
for column_name in title_case_columns:
    cleaned_df = cleaned_df.withColumn(column_name, initcap(trim(col(column_name))))

# Descriptions are only trimmed; their letter case is left untouched.
for column_name in trim_only_columns:
    cleaned_df = cleaned_df.withColumn(column_name, trim(col(column_name)))

# Trim, then upper-case.
for column_name in upper_case_columns:
    cleaned_df = cleaned_df.withColumn(column_name, upper(trim(col(column_name))))

prod_silver_df = cleaned_df

In [0]:
# Render a five-row sample of the cleaned frame.
silver_preview = prod_silver_df.limit(5)
silver_preview.display()

### Dropping Duplicate Rows

In [0]:
# Imported here so this cell is self-contained: `Window` is not imported by any
# other cell of the notebook.
from pyspark.sql import Window
from pyspark.sql import functions as F

print(f"Before deduplicate count : {prod_silver_df.count()}")

# A plain dropDuplicates() compares every column, including `ingestion_ts` and
# `_source_file`, so the same product ingested in two batches survives as two rows.
# Duplicate keys then either get written as-is (first creation of the table) or make
# the MERGE fail because several source rows match one target row.
# Keep exactly one row per product_id instead: the most recently ingested one, with
# `_source_file` as a tie-breaker. Exact duplicate rows are collapsed by the same step.
# Rows whose product_id is null fall into a single group here; the null filter that
# follows removes them anyway.
latest_first = Window.partitionBy("product_id").orderBy(
    F.col("ingestion_ts").desc_nulls_last(),
    F.col("_source_file").desc_nulls_last(),
)

prod_silver_df = (
    prod_silver_df
    .withColumn("_row_rank", F.row_number().over(latest_first))
    .filter(F.col("_row_rank") == 1)
    .drop("_row_rank")
)

print(f"After deduplicate count : {prod_silver_df.count()}")

### Filtering out null product_id, category, and sub_category rows

In [0]:
# Keep only rows where all three required fields are present.
has_product_id = col("product_id").isNotNull()
has_category = col("category").isNotNull()
has_sub_category = col("sub_category").isNotNull()

prod_silver_df = prod_silver_df.filter(has_product_id & has_category & has_sub_category)

### Validating the Primary Key

In [0]:
# product_id values that occur on more than one row.
duplicate_keys_df = (
    prod_silver_df.groupBy("product_id")
    .agg(count('*').alias("count"))
    .filter('count > 1')
)

# Show the offending keys (an empty result means the key is unique).
duplicate_keys_df.display()

# Enforce the check instead of only displaying it: product_id is the join key of the
# write step below, so stop here rather than write or merge rows with duplicate keys.
duplicate_key_total = duplicate_keys_df.count()
if duplicate_key_total > 0:
    raise ValueError(
        f"product_id is not unique: {duplicate_key_total} key(s) appear on more than one row"
    )

### Null Check

In [0]:
# One aggregate per column: cast the is-null flag to an integer and add it up.
# `sum` here is the Spark column function brought in by the star import.
null_count_exprs = [
    sum(col(column_name).isNull().cast(IntegerType())).alias(column_name)
    for column_name in prod_silver_df.columns
]

null_counts = prod_silver_df.select(null_count_exprs)
display(null_counts)

### Handling Null Values

In [0]:
# Placeholder per column, applied in a single fillna call.
# The placeholder strings are reproduced exactly as the pipeline has always written them.
attribute_columns = ["color", "sizes"]
description_columns = [
    "description_PT",
    "description_DE",
    "description_FR",
    "description_ES",
    "description_EN",
    "description_ZH",
]

placeholder_by_column = {column_name: "UnKnown" for column_name in attribute_columns}
placeholder_by_column.update(
    {column_name: "Not Available" for column_name in description_columns}
)

prod_silver_df = prod_silver_df.fillna(placeholder_by_column)

### Validating Nulls

In [0]:
# Recount nulls per column after the fill step, using the same is-null-flag sum.
columns_to_check = prod_silver_df.columns
null_counts = prod_silver_df.select(
    *[
        sum(col(column_name).isNull().cast(IntegerType())).alias(column_name)
        for column_name in columns_to_check
    ]
)
display(null_counts)

### Schema Enforcement Check

In [0]:
# Ingestion metadata columns are allowed even though they are not schema keys.
metadata_cols = {"ingestion_ts", "_source_file"}

expected_cols = set(SILVER_PRODUCTS_SCHEMA.keys())
incoming_cols = set(prod_silver_df.columns)

# Columns present in the frame that are neither schema keys nor metadata.
unknown_cols = incoming_cols.difference(expected_cols, metadata_cols)

print("Unknown columns in Bronze:", unknown_cols)


### Creating or Updating Silver Products Table

In [0]:
silver_exists = spark.catalog.tableExists(SILVER_PRODUCTS_PATH)

if silver_exists:
    # Table already exists: upsert on product_id, updating every column of matched
    # rows and inserting rows whose key is not in the target yet.
    prod_silver_tbl = DeltaTable.forName(spark, SILVER_PRODUCTS_PATH)
    upsert = (
        prod_silver_tbl.alias("tgt")
        .merge(prod_silver_df.alias("src"), "tgt.product_id = src.product_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()
else:
    # First run: create the table from the current frame.
    silver_writer = prod_silver_df.write.format("delta").mode("overwrite")
    silver_writer.saveAsTable(SILVER_PRODUCTS_PATH)

In [0]:
# Read the silver table back and render a five-row sample of what was written.
silver_table_df = spark.read.table(SILVER_PRODUCTS_PATH)
silver_table_df.limit(5).display()

In [0]:
# Row count of the silver table; left as the last expression so it shows as cell output.
silver_table_rows = spark.read.table(SILVER_PRODUCTS_PATH)
silver_table_rows.count()