In [0]:
# ------------------------------
# 
# This script does the following:
# 
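#     1. Reads the bronze tables (brands, category, products), cleans them,
#        and writes the results to the corresponding silver tables.
# 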
# --------------------------------------


import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType, DateType, TimestampType, FloatType

# Catalog whose `bronze` schema is read from and whose `silver` schema is written to below.
catalog_name = "ecommerce"


## Brands

In [0]:
# Load the brands table from the bronze schema and preview ten rows.
brands_bronze_table = f"{catalog_name}.bronze.brz_brands"
df_bronze = spark.table(brands_bronze_table)
display(df_bronze.limit(10))
# Columns that need transformation:


In [0]:
# Clean the brand columns:
#   - brand_name: trim surrounding spaces
#   - brand_code: remove every character that is not an ASCII letter or digit
df_silver = (
    df_bronze
    .withColumn("brand_name", F.trim(F.col("brand_name")))
    .withColumn(
        "brand_code",
        F.regexp_replace(F.col("brand_code"), r"[^A-Za-z0-9]", ""),
    )
)
display(df_silver.limit(10))


In [0]:
# List the distinct category codes; the codes were observed not to be unified.
(
    df_silver
    .select("category_code")
    .distinct()
    .show()
)

In [0]:
# Mapping from the category-code values to replace to their replacements.
anomalies = dict(
    GROCERY="GRCY",
    BOOKS="BKS",
    TOYS="TOY",
)

# Replace values in category_code according to the `anomalies` mapping;
# other columns are left untouched.
df_silver = df_silver.replace(anomalies, subset=["category_code"])

# Show the distinct category codes after the replacement.
(
    df_silver
    .select("category_code")
    .distinct()
    .show()
)


In [0]:
# Write the cleaned brands to the silver schema as a Delta table,
# overwriting existing data with the mergeSchema option enabled.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_brands")
)

## Category

In [0]:
# Load the category table from the bronze schema and preview five rows.
category_bronze_table = f"{catalog_name}.bronze.brz_category"
df_bronze = spark.table(category_bronze_table)
display(df_bronze.limit(5))

In [0]:
# There are duplicates in the table: list category codes that occur more than once.
df_duplicates = (
    df_bronze
    .groupBy("category_code")
    .count()
    .where(F.col("count") > 1)
)
display(df_duplicates.limit(5))

In [0]:
# Upper-case the key *before* de-duplicating so that codes differing only by
# letter case (e.g. "toys" / "TOYS") collapse into a single row instead of
# surviving de-duplication and becoming duplicates once the code is normalised.
df_silver = (
    df_bronze
    .withColumn("category_code", F.upper(F.col("category_code")))
    .dropDuplicates(["category_code"])
)
display(df_silver)

In [0]:
# Store the category codes in upper case.
df_silver = df_silver.withColumn(
    "category_code",
    F.upper(F.col("category_code")),
)
display(df_silver)

In [0]:
# Write the cleaned categories to the silver schema as a Delta table,
# overwriting existing data with the mergeSchema option enabled.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_category")
)

## Products

In [0]:
# Load the products table from the bronze schema.
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_products")

# Table dimensions: number of rows and number of columns.
row_count = df_bronze.count()
column_count = len(df_bronze.columns)

# Report both figures.
print(f"Row count: {row_count}")
print(f"Column count: {column_count}")

In [0]:
# Preview five rows of the bronze products table.
display(df_bronze.limit(5))

In [0]:
# Start the silver products frame from bronze with category_code upper-cased.
df_silver = df_bronze.withColumn(
    "category_code",
    F.upper(F.col("category_code")),
)
display(df_silver.limit(5))

In [0]:
# The weights carry a trailing "g": strip the "g" characters and cast the
# remainder to float. (The former bare `df_bronze.select("weight_grams")`
# statement was removed: its result was discarded and never displayed.)
df_silver = df_silver.withColumn(
    "weight_grams",
    F.regexp_replace(F.col("weight_grams"), "g", "").cast(FloatType()),
)
display(df_silver.limit(5))

In [0]:
# length_cm: swap the comma for a dot, cast to float and round to 2 decimals.
# category_code / brand_code: convert to upper case.
df_silver = (
    df_silver
    .withColumn(
        "length_cm",
        F.round(
            F.regexp_replace(F.col("length_cm"), ",", ".").cast(FloatType()),
            2,
        ),
    )
    .withColumn("category_code", F.upper(F.col("category_code")))
    .withColumn("brand_code", F.upper(F.col("brand_code")))
)

# Spelling mistakes found in the material column, mapped from the misspelled
# value (key) to the correct spelling (value).
anomalies = {
    "Coton": "Cotton",
    "Alumium": "Aluminum",
    "Ruber": "Rubber",
}

# rating_count: non-null values are replaced by their absolute value (removing
# negative counts); null values are replaced with 0.
df_silver = df_silver.withColumn(
    "rating_count",
    F.when(F.col("rating_count").isNotNull(), F.abs(F.col("rating_count")))
    .otherwise(F.lit(0)),
)

# material: replace values according to the `anomalies` mapping.
df_silver = df_silver.replace(anomalies, subset=["material"])

# Check the cleaned data: show ten rows of the transformed columns, untruncated.
cleaned_columns = [
    "weight_grams",
    "length_cm",
    "category_code",
    "brand_code",
    "material",
    "rating_count",
]
df_silver.select(*cleaned_columns).show(10, truncate=False)



In [0]:
# Write the cleaned products to the silver schema as a Delta table,
# overwriting existing data with the mergeSchema option enabled.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_products")
)