### Importing the Needed Modules

In [0]:
import os
import sys

from delta.tables import DeltaTable
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
sys.path.append("/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform")

from src.paths import BRONZE_PRODUCTS_PATH, SILVER_PRODUCTS_PATH
from src.schema_definitions import SILVER_PRODUCTS_SCHEMA

### Querying the Bronze Products Table

In [0]:
# Load the Bronze products table and preview its first five rows.
prod_bronze_df = spark.read.table(BRONZE_PRODUCTS_PATH)
display(prod_bronze_df.limit(5))

Product ID,Category,Sub Category,Description PT,Description DE,Description FR,Description ES,Description EN,Description ZH,Color,Sizes,Production Cost,ingestion_ts,_source_file
1,Feminine,Coats and Blazers,Esportivo Veludo Verde Com Botões,Sport Samt Sport Mit Knöpfen,Sports Velvet Sports Avec Des Boutons,Deportes De Terciopelo Con Botones,Sports Velvet Sports With Buttons,运动天鹅绒运动与按钮,,S|M|L|XL,10.73,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
2,Feminine,Sweaters and Knitwear,Luxuoso Denim Rosa Com Botões,Luxuriöser Rosa Jeans Mit Knöpfen,Léchard De Denim Rose Avec Boutons,Denim Rosa Lujoso Con Botones,Luxurious Pink Denim With Buttons,豪华的粉红色牛仔布和纽扣,PINK,S|M|L|XL,19.55,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
3,Feminine,Dresses and Jumpsuits,Retrô Tricot Preto Estampado,Black Tricot Gedruckter Tricot,Tricot Imprimé En Tricot Noir,Tricot Negro Tricot Impreso,Black Tricot Printed Tricot,黑色三角形印刷三角形,BLACK,S|M|L|XL,25.59,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
4,Feminine,Shirts and Blouses,Blusa De Algodão Básica,Basis -Baumwollbluse,Chemisier En Coton De Base,Blusa De Algodón,Basic Cotton Blouse,基本的棉衬衫,,S|M|L|XL,27.62,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
5,Feminine,T-shirts and Tops,T-Shirt Básica De Algodão,Basis-Baumwoll-T-Shirt,T-Shirt En Coton De Base,Camiseta Básica De Algodón,Basic Cotton T-Shirt,基本棉T恤,,S|M|L,11.69,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv


### Silver Products Schema Reference

In [0]:
# Show the target Silver schema (column name -> Spark type name) for reference.
SILVER_PRODUCTS_SCHEMA

{'product_id': 'integer',
 'category': 'string',
 'sub_category': 'string',
 'description_PT': 'string',
 'description_DE': 'string',
 'description_FR': 'string',
 'description_ES': 'string',
 'description_EN': 'string',
 'description_ZH': 'string',
 'color': 'string',
 'sizes': 'string',
 'production_cost': 'double'}

### Schema Enforcement and Column Name Standardization

In [0]:
# Bronze header -> Silver column name for every business column.
# Insertion order is the column order of the resulting DataFrame.
BRONZE_TO_SILVER_COLUMNS = {
    "Product ID": "product_id",
    "Category": "category",
    "Sub Category": "sub_category",
    "Description PT": "description_PT",
    "Description DE": "description_DE",
    "Description FR": "description_FR",
    "Description ES": "description_ES",
    "Description EN": "description_EN",
    "Description ZH": "description_ZH",
    "Color": "color",
    "Sizes": "sizes",
    "Production Cost": "production_cost",
}

# Rename each Bronze column and cast it to the type declared in
# SILVER_PRODUCTS_SCHEMA, so the schema definition is the single source of
# truth for Silver types. A Silver column missing from the schema raises
# KeyError here instead of silently getting a hand-written type.
# The two lineage columns are passed through unchanged.
prod_silver_df = prod_bronze_df.select(
    *[
        col(bronze_name).cast(SILVER_PRODUCTS_SCHEMA[silver_name]).alias(silver_name)
        for bronze_name, silver_name in BRONZE_TO_SILVER_COLUMNS.items()
    ],
    col("ingestion_ts"),
    col("_source_file"),
)


### Trimming Whitespace and Standardizing Value Formats

In [0]:
# Normalise the text columns in place (column names and order are unchanged):
#   - category / sub_category : trim, then capitalise each word
#   - description_*           : trim only
#   - color / sizes           : trim, then upper-case
title_case_columns = ["category", "sub_category"]
trim_only_columns = [
    "description_PT",
    "description_DE",
    "description_FR",
    "description_ES",
    "description_EN",
    "description_ZH",
]
upper_case_columns = ["color", "sizes"]

for column_name in title_case_columns:
    prod_silver_df = prod_silver_df.withColumn(column_name, initcap(trim(col(column_name))))

for column_name in trim_only_columns:
    prod_silver_df = prod_silver_df.withColumn(column_name, trim(col(column_name)))

for column_name in upper_case_columns:
    prod_silver_df = prod_silver_df.withColumn(column_name, upper(trim(col(column_name))))

In [0]:
# Preview the first five rows after trimming and case normalisation.
display(prod_silver_df.limit(5))

product_id,category,sub_category,description_PT,description_DE,description_FR,description_ES,description_EN,description_ZH,color,sizes,production_cost,ingestion_ts,_source_file
1,Feminine,Coats And Blazers,Esportivo Veludo Verde Com Botões,Sport Samt Sport Mit Knöpfen,Sports Velvet Sports Avec Des Boutons,Deportes De Terciopelo Con Botones,Sports Velvet Sports With Buttons,运动天鹅绒运动与按钮,,S|M|L|XL,10.73,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
2,Feminine,Sweaters And Knitwear,Luxuoso Denim Rosa Com Botões,Luxuriöser Rosa Jeans Mit Knöpfen,Léchard De Denim Rose Avec Boutons,Denim Rosa Lujoso Con Botones,Luxurious Pink Denim With Buttons,豪华的粉红色牛仔布和纽扣,PINK,S|M|L|XL,19.55,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
3,Feminine,Dresses And Jumpsuits,Retrô Tricot Preto Estampado,Black Tricot Gedruckter Tricot,Tricot Imprimé En Tricot Noir,Tricot Negro Tricot Impreso,Black Tricot Printed Tricot,黑色三角形印刷三角形,BLACK,S|M|L|XL,25.59,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
4,Feminine,Shirts And Blouses,Blusa De Algodão Básica,Basis -Baumwollbluse,Chemisier En Coton De Base,Blusa De Algodón,Basic Cotton Blouse,基本的棉衬衫,,S|M|L|XL,27.62,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
5,Feminine,T-shirts And Tops,T-Shirt Básica De Algodão,Basis-Baumwoll-T-Shirt,T-Shirt En Coton De Base,Camiseta Básica De Algodón,Basic Cotton T-Shirt,基本棉T恤,,S|M|L,11.69,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv


### Dropping Duplicate Rows

In [0]:
# Keep exactly one row per product_id, the key the Silver MERGE joins on.
# A full-row dropDuplicates() alone cannot guarantee that: ingestion_ts and
# _source_file are part of the row, so the same product ingested twice would
# survive as two "distinct" rows and the MERGE would then see several source
# rows for one target row.
print(f"Before deduplicate count : {prod_silver_df.count()}")

# Newest ingestion first; _source_file is only a tie-breaker for equal timestamps.
latest_first = Window.partitionBy("product_id").orderBy(
    col("ingestion_ts").desc(), col("_source_file").desc()
)

prod_silver_df = (
    prod_silver_df.dropDuplicates()  # still remove byte-identical rows first
    .withColumn("_row_rank", row_number().over(latest_first))
    .filter(col("_row_rank") == 1)
    .drop("_row_rank")
)
print(f"After deduplicate count : {prod_silver_df.count()}")

Before deduplicate count : 17940
After deduplicate count : 17940


### Filtering out null product_id, category, and sub_category rows

In [0]:
# Keep only rows where all three required columns are populated.
has_required_values = (
    col("product_id").isNotNull()
    & col("category").isNotNull()
    & col("sub_category").isNotNull()
)
prod_silver_df = prod_silver_df.filter(has_required_values)

### Validating the Primary Key

In [0]:
# List every product_id that occurs more than once (expected: no rows).
duplicate_keys_df = (
    prod_silver_df.groupBy("product_id")
    .agg(count('*').alias("count"))
    .filter('count > 1')
)
duplicate_keys_df.display()

# Fail fast: displaying the duplicates alone would let a Run All continue to
# the Silver write with a non-unique key.
if duplicate_keys_df.limit(1).count() > 0:
    raise ValueError(
        "product_id is not unique in prod_silver_df; refusing to write the Silver Products table"
    )

product_id,count


### Null Check

In [0]:
# Count nulls per column: turn each isNull() flag into 0/1 and add them up.
# NOTE: `sum` here is the Spark aggregate brought in by the star import of
# pyspark.sql.functions, not Python's built-in.
null_flag_totals = []
for column_name in prod_silver_df.columns:
    null_flag = col(column_name).isNull().cast(IntegerType())
    null_flag_totals.append(sum(null_flag).alias(column_name))

null_counts = prod_silver_df.select(null_flag_totals)
display(null_counts)

product_id,category,sub_category,description_PT,description_DE,description_FR,description_ES,description_EN,description_ZH,color,sizes,production_cost,ingestion_ts,_source_file
0,0,0,0,0,0,0,0,0,12445,2070,0,0,0


### Handling Null Values

In [0]:
# Replace nulls in the optional text columns with a per-column placeholder.
description_columns = (
    "description_PT",
    "description_DE",
    "description_FR",
    "description_ES",
    "description_EN",
    "description_ZH",
)

placeholder_by_column = {"color": "UnKnown", "sizes": "UnKnown"}
for column_name in description_columns:
    placeholder_by_column[column_name] = "Not Available"

prod_silver_df = prod_silver_df.fillna(placeholder_by_column)

### Validating Nulls

In [0]:
# Re-run the per-column null count to confirm the fill left no nulls behind.
# NOTE: `sum` is the Spark aggregate from the pyspark.sql.functions star import.
def _null_total(column_name):
    """Return a Spark aggregate summing the 0/1 null flags of one column."""
    return sum(col(column_name).isNull().cast(IntegerType())).alias(column_name)


null_counts = prod_silver_df.select(list(map(_null_total, prod_silver_df.columns)))
display(null_counts)

product_id,category,sub_category,description_PT,description_DE,description_FR,description_ES,description_EN,description_ZH,color,sizes,production_cost,ingestion_ts,_source_file
0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Schema Enforcement Check

In [0]:
# Compare the prepared Silver DataFrame against SILVER_PRODUCTS_SCHEMA in three
# ways: no unexpected columns, no missing columns, and matching column types.
expected_cols = set(SILVER_PRODUCTS_SCHEMA.keys())
incoming_cols = set(prod_silver_df.columns)

# Lineage columns carried over from Bronze; they are not part of the schema dict.
audit_cols = {"ingestion_ts", "_source_file"}

unknown_cols = incoming_cols - expected_cols - audit_cols
missing_cols = expected_cols - incoming_cols

# DataType.typeName() yields 'integer' / 'string' / 'double', the same spelling
# the schema reference uses, so the two can be compared directly.
actual_types = {
    field.name: field.dataType.typeName() for field in prod_silver_df.schema.fields
}
type_mismatches = {
    name: {"expected": expected_type, "actual": actual_types[name]}
    for name, expected_type in SILVER_PRODUCTS_SCHEMA.items()
    if name in actual_types and actual_types[name] != expected_type
}

print("Unknown columns in Bronze:", unknown_cols)

# Enforce the contract: stop before the write if anything deviates.
if unknown_cols or missing_cols or type_mismatches:
    raise ValueError(
        "Silver products schema check failed: "
        f"unknown columns={sorted(unknown_cols)}, "
        f"missing columns={sorted(missing_cols)}, "
        f"type mismatches={type_mismatches}"
    )


Unknown columns in Bronze: set()


### Creating or Updating Silver Products Table

In [0]:
# Upsert into the Silver table when it already exists; otherwise create it
# from the prepared DataFrame.
silver_table_exists = spark.catalog.tableExists(SILVER_PRODUCTS_PATH)

if silver_table_exists:
    prod_silver_tbl = DeltaTable.forName(spark, SILVER_PRODUCTS_PATH)

    # Match on product_id: matched rows are updated, unmatched rows are inserted.
    upsert = (
        prod_silver_tbl.alias("tgt")
        .merge(prod_silver_df.alias("src"), "tgt.product_id = src.product_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()
else:
    delta_writer = prod_silver_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(SILVER_PRODUCTS_PATH)

In [0]:
# Read the Silver table back and preview five rows.
display(spark.read.table(SILVER_PRODUCTS_PATH).limit(5))

product_id,category,sub_category,description_PT,description_DE,description_FR,description_ES,description_EN,description_ZH,color,sizes,production_cost,ingestion_ts,_source_file
3388,Feminine,Suits And Sets,Conjunto De Blusa De Malha E Calça Com Detalhes De Drapeado,Set Von Gestrickten Blusen Und Hosen Mit Drapierten Details,Ensemble De Chemisier Tricoté Et De Pantalons Avec Des Détails Drapés,Conjunto De Blusa Y Pantalones Tejidos Con Detalles Drapeados,Set Of Knitted Blouse And Pants With Draped Details,一套针织上衣和带有细节的裤子,UnKnown,S|M|L,47.68,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
4324,Feminine,Suits And Sets,Fato De Saia Longa E Blusa De Manga Curta,Longrock Und Kurze Bluse,Fait De La Jupe Longue Et Du Chemisier À Couture Courte,Hecho De Falda Larga Y Blusa Corta,Fact Of Long Skirt And Short -Sleeved Blouse,长裙和短裙上衣的事实,UnKnown,S|M|L|XL,15.0,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
5088,Masculine,Sportswear,Jaqueta Masculina De Corrida Com Zíper Total,Herrenjacke Mit Totalem Reißverschluss,Veste Pour Hommes Coulant Avec Une Fermeture Éclair Totale,Chaqueta Para Hombres Corriendo Con Cremallera Total,Men'S Jacket Running With Total Zipper,男士夹克跑步齐全,UnKnown,M|L|XL|XXL,15.13,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
6593,Masculine,Shirts,Camisa Masculina Com Manga Dobrável,Herrenhemd Mit Faltem Ärmel,Chemise Pour Hommes Avec Manche Pliante,Camisa De Hombres Con Manga Plegable,Men'S Shirt With Folding Sleeve,男士衬衫带折叠套,UnKnown,M|L|XL|XXL,12.17,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv
7737,Masculine,Shirts,Camisa Masculina De Manga Curta Com Estampa De Animais,Kurzes Hemd Für Männer Mit Tierdruck,Short-Sleeved Men'S Shirt With Animal Print,Camisa Para Hombres De Manga Corta Con Estampado De Animales,Short -Sleeved Men'S Shirt With Animal Print,短 - 戴上动物印花的男士衬衫,UnKnown,M|L|XL|XXL,20.07,2026-01-14T05:45:59.737Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/products.csv


In [0]:
# Row count of the Silver Products table as currently stored.
spark.read.table(SILVER_PRODUCTS_PATH).count()

17940