In [0]:
import pyspark.sql.functions as F
import random

In [0]:
# Expose an "env" text widget (created here with the value "dev") and read
# the selected environment back; the catalog name is derived from it.
dbutils.widgets.text("env", "dev")
env = dbutils.widgets.get("env")
catalog = f"supply_{env}"

# Load the raw bronze table for that catalog and preview its first 10 rows.
bronze_table = f"{catalog}.bronze.makeup_supply_chain_raw"
df_bronze = spark.table(bronze_table)
df_bronze.limit(10).display()

In [0]:
# Add missing values
# For each listed column, a row's value is replaced by NULL whenever that
# row's random draw falls below NULL_FRACTION; otherwise the value is kept.
NULL_FRACTION = 0.12
cols_with_nulls = ["Lead_times", "Shipping_costs", "Customer_demographics", "Inspection_results"]

# Start from the raw frame once and keep building on the running result.
# Re-deriving from df_bronze inside the loop would throw away the nulls
# injected into every column except the last one processed.
df = df_bronze
for null_col in cols_with_nulls:
    df = df.withColumn(
        null_col,
        F.when(F.rand() < NULL_FRACTION, None).otherwise(F.col(null_col)),
    )



In [0]:
# Duplicate rows: sample with replacement (fraction 0.07, seed 42) and append
# the sampled rows back onto the frame.
duplicates = df.sample(withReplacement=True, fraction=0.07, seed=42)
df = df.union(duplicates)

# Column handles shared by the expressions below.
product_type = F.col("Product_type")
shipping_costs = F.col("Shipping_costs")
manufacturing_costs = F.col("Manufacturing_costs")
stock_levels = F.col("Stock_levels")
lead_times = F.col("Lead_times")
inspection_results = F.col("Inspection_results")

# Inconsistent text formatting for Product_type: lower / upper / initcap, or
# the value with one trailing space appended.
# Every F.rand() call is its own draw, so each later threshold only applies
# to rows that fell through the branches above it (the thresholds are not
# cumulative cut-offs of a single draw).
product_type_messy = (
    F.when(F.rand() < 0.3, F.lower(product_type))
    .when(F.rand() < 0.6, F.upper(product_type))
    .when(F.rand() < 0.8, F.initcap(product_type))
    .otherwise(F.concat(product_type, F.lit(" ")))
)

# Outliers in numeric columns: occasionally multiply the cost by 5 or 6.
shipping_costs_outliers = (
    F.when(F.rand() < 0.03, shipping_costs * 5).otherwise(shipping_costs)
)
manufacturing_costs_outliers = (
    F.when(F.rand() < 0.02, manufacturing_costs * 6).otherwise(manufacturing_costs)
)

# Negative values in numeric columns: flip the stock level to -|value|, or
# overwrite the lead time with the constant -5.
stock_levels_negative = (
    F.when(F.rand() < 0.03, -F.abs(stock_levels)).otherwise(stock_levels)
)
lead_times_negative = (
    F.when(F.rand() < 0.02, F.lit(-5)).otherwise(lead_times)
)

# Corrupt inspection results with "Unknown" / "FAIL" labels (same
# independent-draw behaviour as the Product_type chain above).
inspection_results_corrupt = (
    F.when(F.rand() < 0.25, F.lit("Unknown"))
    .when(F.rand() < 0.35, F.lit("FAIL"))
    .otherwise(inspection_results)
)

# Apply the replacements in the same order as they are defined above.
df = (
    df
    .withColumn("Product_type", product_type_messy)
    .withColumn("Shipping_costs", shipping_costs_outliers)
    .withColumn("Manufacturing_costs", manufacturing_costs_outliers)
    .withColumn("Stock_levels", stock_levels_negative)
    .withColumn("Lead_times", lead_times_negative)
    .withColumn("Inspection_results", inspection_results_corrupt)
)





In [0]:
# String-level noise: padding, case changes, special characters, SKU noise,
# placeholder values and doubled internal spaces.
#
# The product-type column is spelled "Product_type" throughout this cell so it
# matches the spelling used by the earlier text-formatting step and the
# lower-case-after-underscore convention of every other column here.
# withColumn names its output after the string it is given, so a different
# casing ("Product_Type") would silently rename the column when Spark resolves
# names case-insensitively, and would not match "Product_type" at all when
# spark.sql.caseSensitive is enabled.
# NOTE(review): assumes the bronze column is "Product_type" - confirm against
# the table schema.
string_cols = [
    "Product_type", "SKU", "Supplier_name",
    "Location", "Shipping_carriers", "Routes", "Transportation_modes"
]

# add leading and trailing spaces (one leading, two trailing)
for name in string_cols:
    df = df.withColumn(
        name,
        F.when(F.rand() < 0.15,
               F.concat(F.lit(" "), F.col(name), F.lit("  ")))
         .otherwise(F.col(name))
    )

# random case changes
# Each F.rand() is a separate draw, so the second threshold applies only to
# rows that were not upper-cased by the first branch.
for name in string_cols:
    df = df.withColumn(
        name,
        F.when(F.rand() < 0.1, F.upper(F.col(name)))
         .when(F.rand() < 0.2, F.lower(F.col(name)))
         .otherwise(F.col(name))
    )

# inject special characters: trailing "#" or leading "@"
for name in ["Supplier_name", "Location", "Product_type"]:
    df = df.withColumn(
        name,
        F.when(F.rand() < 0.08,
               F.concat(F.col(name), F.lit("#")))
         .when(F.rand() < 0.12,
               F.concat(F.lit("@"), F.col(name)))
         .otherwise(F.col(name))
    )

# add random noise to SKU: append "-X", or strip every digit
df = df.withColumn(
    "SKU",
    F.when(F.rand() < 0.1,
           F.concat(F.col("SKU"), F.lit("-X")))
     .when(F.rand() < 0.15,
           F.regexp_replace("SKU", "[0-9]", ""))
     .otherwise(F.col("SKU"))
)

# insert N?A values
# NOTE(review): the literal is "N?A" exactly as originally written, not "N/A";
# confirm this is the intended placeholder token.
df = df.withColumn(
    "Product_type",
    F.when(F.rand() < 0.05, F.lit("N?A"))
     .otherwise(F.col("Product_type"))
)

# random internal double spaces: every single space in the value is doubled
for name in ["Product_type", "Routes"]:
    df = df.withColumn(
        name,
        F.when(F.rand() < 0.1,
               F.regexp_replace(F.col(name), " ", "  "))
         .otherwise(F.col(name))
    )

In [0]:
# Write the dirtied frame to the bronze schema of the selected catalog,
# using "overwrite" mode so an existing table's contents are replaced.
dirty_table = f"{catalog}.bronze.makeup_supply_chain_dirty"
(
    df.write
    .mode("overwrite")
    .saveAsTable(dirty_table)
)