In [0]:
from pyspark.sql.functions import *

In [0]:
# Read the bronze-layer "products" Delta table (stoyan.bronze_products)
# into a Spark DataFrame and render it for a quick visual check.
products = (
    spark.read
    .format('delta')
    .table('stoyan.bronze_products')
)
products.display()

PRODUCT_ID,PRODUCT_NAME,CATEGORY_NAME,WEIGHT_CLASS,PRODUCT_STATUS,LIST_PRICE,MIN_PRICE,batch_id,ingest_datetime
2243,Monitor 17/HR/F,hardware1,4,orderable,350.0,302.0,0,2024-12-13T17:31:26.892Z
3057,Monitor 17/SD,hardware1,4,orderable,369.0,320.0,0,2024-12-13T17:31:26.892Z
3061,Monitor 19/SD,hardware1,5,orderable,499.0,437.0,0,2024-12-13T17:31:26.892Z
2245,Monitor 19/SD/M,hardware1,5,orderable,512.0,420.0,0,2024-12-13T17:31:26.892Z
3065,Monitor 21/D,hardware1,5,orderable,999.0,875.0,0,2024-12-13T17:31:26.892Z
3331,Monitor 21/HR,hardware1,5,orderable,879.0,785.0,0,2024-12-13T17:31:26.892Z
2252,Monitor 21/HR/M,hardware1,5,obsolete,889.0,717.0,0,2024-12-13T17:31:26.892Z
3064,Monitor 21/SD,hardware1,5,planned,1023.0,909.0,0,2024-12-13T17:31:26.892Z
3155,Monitor Hinge - HD,hardware1,4,orderable,49.0,42.0,0,2024-12-13T17:31:26.892Z
3234,Monitor Hinge - STD,hardware1,3,orderable,39.0,34.0,0,2024-12-13T17:31:26.892Z


In [0]:
# Convert column names to lowercase.
# A list comprehension is used rather than a `for column in ...` loop so that no
# variable named `column` is left behind in the notebook namespace, where it
# would overwrite the `column` function provided by
# `from pyspark.sql.functions import *`.
new_col = [name.lower() for name in products.columns]  # Lower-cased names, in the original column order
products = products.toDF(*new_col)                      # Rename columns in the DataFrame
products.display()

product_id,product_name,category_name,weight_class,product_status,list_price,min_price,batch_id,ingest_datetime
2243,Monitor 17/HR/F,hardware1,4,orderable,350.0,302.0,0,2024-12-13T17:31:26.892Z
3057,Monitor 17/SD,hardware1,4,orderable,369.0,320.0,0,2024-12-13T17:31:26.892Z
3061,Monitor 19/SD,hardware1,5,orderable,499.0,437.0,0,2024-12-13T17:31:26.892Z
2245,Monitor 19/SD/M,hardware1,5,orderable,512.0,420.0,0,2024-12-13T17:31:26.892Z
3065,Monitor 21/D,hardware1,5,orderable,999.0,875.0,0,2024-12-13T17:31:26.892Z
3331,Monitor 21/HR,hardware1,5,orderable,879.0,785.0,0,2024-12-13T17:31:26.892Z
2252,Monitor 21/HR/M,hardware1,5,obsolete,889.0,717.0,0,2024-12-13T17:31:26.892Z
3064,Monitor 21/SD,hardware1,5,planned,1023.0,909.0,0,2024-12-13T17:31:26.892Z
3155,Monitor Hinge - HD,hardware1,4,orderable,49.0,42.0,0,2024-12-13T17:31:26.892Z
3234,Monitor Hinge - STD,hardware1,3,orderable,39.0,34.0,0,2024-12-13T17:31:26.892Z


In [0]:
# Define a transformation function to process the "products" DataFrame

def transformation(products):
    """Reshape the ingested products DataFrame for the next layer.

    Three changes are applied, in this order:
      1. ``ingest_datetime`` is renamed to ``consume_timestamp``.
      2. ``batch_id`` is dropped.
      3. ``load_tsp`` is added, populated with ``current_timestamp()``.

    Args:
        products: The DataFrame to transform. It must provide
            ``withColumnRenamed``, ``drop`` and ``withColumn``.

    Returns:
        The DataFrame produced by applying the three steps above.
    """
    return (
        products
        # Expose the ingestion time under the name "consume_timestamp"
        .withColumnRenamed('ingest_datetime', 'consume_timestamp')
        # The batch identifier is not carried forward
        .drop('batch_id')
        # Audit column recording when this load was evaluated
        .withColumn('load_tsp', current_timestamp())
    )
# Run the transformation on the `products` DataFrame and render the result.
transformed_products = transformation(products=products)
transformed_products.display()

PRODUCT_ID,PRODUCT_NAME,CATEGORY_NAME,WEIGHT_CLASS,PRODUCT_STATUS,LIST_PRICE,MIN_PRICE,consume_timestamp,load_tsp
2243,Monitor 17/HR/F,hardware1,4,orderable,350.0,302.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
3057,Monitor 17/SD,hardware1,4,orderable,369.0,320.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
3061,Monitor 19/SD,hardware1,5,orderable,499.0,437.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
2245,Monitor 19/SD/M,hardware1,5,orderable,512.0,420.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
3065,Monitor 21/D,hardware1,5,orderable,999.0,875.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
3331,Monitor 21/HR,hardware1,5,orderable,879.0,785.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
2252,Monitor 21/HR/M,hardware1,5,obsolete,889.0,717.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
3064,Monitor 21/SD,hardware1,5,planned,1023.0,909.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
3155,Monitor Hinge - HD,hardware1,4,orderable,49.0,42.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z
3234,Monitor Hinge - STD,hardware1,3,orderable,39.0,34.0,2024-12-13T17:31:26.892Z,2024-12-14T17:02:34.207Z


In [0]:
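# Disabled incremental-load variant, kept for reference: append only the rows whose
# consume_timestamp is later than 2024-12-16.
# new_transformed_products = transformed_products.filter(col('consume_timestamp') > to_date(lit('2024-12-16')))
# new_transformed_products.write.format('delta').mode('append').saveAsTable('stoyan.silver__products')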

In [0]:
# Persist the transformed data to the "stoyan.silver__products" Delta table.
# The save mode is 'overwrite' (not append), so each run replaces the data
# already stored in the table.

(
    transformed_products.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('stoyan.silver__products')
)