In [0]:
from pyspark.sql.functions import *

In [0]:
# Read the bronze-layer "order_items" Delta table into a Spark DataFrame and preview it
order_items = (
    spark.read
    .format('delta')
    .table('stoyan.bronze_order_items')
)
order_items.display()

ORDER_ID,LINE_ITEM_ID,PRODUCT_ID,UNIT_PRICE,QUANTITY,batch_id,ingest_datetime
2392,4,3124,77.0,63,0,2024-12-13T17:48:09.096Z
2393,6,3082,78.0,10,0,2024-12-13T17:48:09.096Z
2394,6,3134,18.0,45,0,2024-12-13T17:48:09.096Z
2399,6,2311,86.9,20,0,2024-12-13T17:48:09.096Z
2400,4,2999,880.0,16,0,2024-12-13T17:48:09.096Z
2411,6,3106,45.0,11,0,2024-12-13T17:48:09.096Z
2412,6,3139,20.0,79,0,2024-12-13T17:48:09.096Z
2418,4,3110,45.0,20,0,2024-12-13T17:48:09.096Z
2419,4,3129,43.0,57,0,2024-12-13T17:48:09.096Z
2421,6,3129,43.0,172,0,2024-12-13T17:48:09.096Z


In [0]:
# Standardize the column names by converting each one to lowercase
# (only the letter case changes; no characters are replaced).
new_col = [name.lower() for name in order_items.columns]

# toDF renames the columns positionally, so the list must keep the original column order
order_items = order_items.toDF(*new_col)
order_items.display()

order_id,line_item_id,product_id,unit_price,quantity,batch_id,ingest_datetime
2392,4,3124,77.0,63,0,2024-12-13T17:48:09.096Z
2393,6,3082,78.0,10,0,2024-12-13T17:48:09.096Z
2394,6,3134,18.0,45,0,2024-12-13T17:48:09.096Z
2399,6,2311,86.9,20,0,2024-12-13T17:48:09.096Z
2400,4,2999,880.0,16,0,2024-12-13T17:48:09.096Z
2411,6,3106,45.0,11,0,2024-12-13T17:48:09.096Z
2412,6,3139,20.0,79,0,2024-12-13T17:48:09.096Z
2418,4,3110,45.0,20,0,2024-12-13T17:48:09.096Z
2419,4,3129,43.0,57,0,2024-12-13T17:48:09.096Z
2421,6,3129,43.0,172,0,2024-12-13T17:48:09.096Z


In [0]:
# Transformation step for the "order_items" DataFrame, followed by its invocation

def transfomation(order_items):
    """Return a transformed copy of the ``order_items`` DataFrame.

    Steps, applied in order:
      1. Rename the ``ingest_datetime`` column to ``consume_timestamp``.
      2. Drop the ``batch_id`` column.
      3. Add a ``load_tsp`` column populated with ``current_timestamp()``
         for auditing.

    Args:
        order_items: DataFrame to transform.

    Returns:
        The DataFrame produced by the three steps above.
    """
    return (
        order_items
        .withColumnRenamed('ingest_datetime', 'consume_timestamp')
        .drop('batch_id')
        .withColumn('load_tsp', current_timestamp())
    )


# Apply the transformation and preview the result
transformed_order_items = transfomation(order_items)
transformed_order_items.display()

order_id,line_item_id,product_id,unit_price,quantity,consume_timestamp,load_tsp
2392,4,3124,77.0,63,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2393,6,3082,78.0,10,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2394,6,3134,18.0,45,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2399,6,2311,86.9,20,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2400,4,2999,880.0,16,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2411,6,3106,45.0,11,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2412,6,3139,20.0,79,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2418,4,3110,45.0,20,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2419,4,3129,43.0,57,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z
2421,6,3129,43.0,172,2024-12-13T17:48:09.096Z,2024-12-13T21:51:00.369Z


In [0]:
# new_transformed_order_items =  transformed_order_items.filter(col('consume_timestamp') > to_date(lit('2024-12-16')))
# new_transformed_order_items.write.format('delta').mode('append').saveAsTable('stoyan.silver__order_items')

In [0]:
# Write the transformed data into the "stoyan.silver__order_items" Delta table.
# 'delta' is a data source format, not a save mode, so it is passed to format();
# mode('overwrite') replaces the table's existing contents on each run.
(
    transformed_order_items.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('stoyan.silver__order_items')
)
