In [0]:
from pyspark.sql.types import StringType, IntegerType, DateType, BooleanType
import pyspark.sql.functions as F

In [0]:
# Catalog that the table names used in this notebook are qualified with.
catalog_name = "ecommerce"

In [0]:
# Read the bronze order-items table and print a sample of its rows.
df = spark.table(f"{catalog_name}.bronze.brz_order_items")
df.show()

In [0]:
# Print the column names and types of the frame as loaded, before any cleaning.
df.printSchema()

In [0]:
# Keep a single row per (order_id, item_seq) pair.
# NOTE(review): this runs before item_seq is cast to int further down, so
# the comparison is on the values as loaded — confirm that is intended.
df = df.dropDuplicates(["order_id", "item_seq"])

In [0]:
# Replace the spelled-out quantity "Two" with the digit "2", then cast the
# column to int. regexp_replace operates on strings, so the replacement must
# be the string '2' (not the integer 2, which the API does not accept).
# Any value that is still non-numeric after the replacement will not cast
# cleanly; the next cell lists the distinct results for a visual check.
df = df.withColumn(
    'quantity',
    F.regexp_replace(F.col('quantity'), 'Two', '2').cast('int')
)

In [0]:
# Show the distinct quantity values after the cast.
df.select(F.col("quantity")).distinct().show()

In [0]:
# Remove "$" characters from unit_price and cast the result to double.
df = df.withColumn(
    "unit_price",
    F.regexp_replace(F.col("unit_price"), "[$]", "").cast("double"),
)

In [0]:
# Show the distinct unit_price values after the cast.
df.select(F.col("unit_price")).distinct().show()

In [0]:
# Remove "%" characters from discount_pct and cast the result to double.
df = df.withColumn(
    "discount_pct",
    F.regexp_replace(F.col("discount_pct"), "[%]", "").cast("double"),
)

In [0]:
# Show the distinct discount_pct values after the cast.
df.select(F.col("discount_pct")).distinct().show()

In [0]:
# Show the distinct coupon_code values before they are normalized.
df.select(F.col("coupon_code")).distinct().show()

In [0]:
# Normalize coupon_code: trim surrounding whitespace, then lower-case it.
df = df.withColumn("coupon_code", F.lower(F.trim(F.col("coupon_code"))))

In [0]:
# Show the distinct coupon_code values after trimming and lower-casing.
df.select(F.col("coupon_code")).distinct().show()

In [0]:
# Show the distinct channel values (this notebook does not modify the column).
df.select(F.col("channel")).distinct().show()

In [0]:
# Convert dt from text in yyyy-MM-dd layout to a date column.
df = df.withColumn("dt", F.to_date(F.col("dt"), "yyyy-MM-dd"))

In [0]:
# Parse order_ts, which arrives in more than one text layout. The patterns are
# tried in order and coalesce keeps the first non-null parse.
# The day-first pattern is needed for values such as "01-08-2025 22:53";
# a year-first pattern cannot match them, so they would otherwise become null.
# NOTE(review): the fallback chain relies on a failed parse yielding null.
# With ANSI mode enabled, to_timestamp raises on a mismatch instead —
# confirm the cluster setting.
df = df.withColumn(
    'order_ts',
    F.coalesce(
        F.to_timestamp('order_ts', 'yyyy-MM-dd HH:mm:ss'),  # e.g. 2025-08-01 22:53:52
        F.to_timestamp('order_ts', 'yyyy-MM-dd HH:mm'),     # e.g. 2025-08-01 22:53
        F.to_timestamp('order_ts', 'dd-MM-yyyy HH:mm'),     # e.g. 01-08-2025 22:53
    )
)

In [0]:
# Cast item_seq to an integer column.
df = df.withColumn("item_seq", F.col("item_seq").cast(IntegerType()))

In [0]:
# Strip every character other than digits, "." and "-" from tax_amount,
# then cast what remains to double.
df = df.withColumn(
    "tax_amount",
    F.regexp_replace(F.col("tax_amount"), r"[^0-9.\-]", "").cast("double"),
)

In [0]:
# Add a processed_time column holding the current timestamp.
df = df.withColumn("processed_time", F.current_timestamp())

In [0]:
# Preview five rows of the cleaned frame.
display(df.limit(5))

In [0]:
# Print the schema again to check the column types after the casts above.
df.printSchema()

In [0]:
# Write the cleaned frame to the silver Delta table in overwrite mode.
# NOTE(review): mergeSchema is set here; if an existing table has columns
# whose types differ from this frame, overwriteSchema may be what is
# needed instead — confirm against the Delta documentation.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_order_items")
)