In [0]:
from pyspark.sql.functions import *

In [0]:
# Read the bronze "orders" Delta table into a Spark DataFrame and preview it
orders = (
    spark.read
    .format('delta')
    .table('stoyan.bronze_orders')
)
orders.display()

ORDER_ID,ORDER_DATE,ORDER_MODE,CUSTOMER_ID,ORDER_STATUS,ORDER_TOTAL,SALES_REP_ID,PROMOTION_ID,batch_id,ingest_datetime
2458,16-AUG-07 03.34.12.234359 PM,direct,101,0,78279.6,153.0,-,0,2024-12-15T20:00:52.137Z
2397,19-NOV-07 02.41.54.696211 PM,direct,102,1,42283.2,154.0,-,0,2024-12-15T20:00:52.137Z
2454,02-OCT-07 05.49.34.678340 PM,direct,103,1,6653.4,154.0,-,0,2024-12-15T20:00:52.137Z
2354,14-JUL-08 06.18.23.234567 PM,direct,104,0,46257.0,155.0,-,0,2024-12-15T20:00:52.137Z
2358,08-JAN-08 05.03.12.654278 PM,direct,105,2,7826.0,155.0,-,0,2024-12-15T20:00:52.137Z
2381,14-MAY-08 08.59.08.843679 PM,direct,106,3,23034.6,156.0,-,0,2024-12-15T20:00:52.137Z
2440,31-AUG-07 09.53.06.008765 PM,direct,107,3,70576.9,156.0,-,0,2024-12-15T20:00:52.137Z
2357,08-JAN-06 08.19.44.123456 PM,direct,108,5,59872.4,158.0,-,0,2024-12-15T20:00:52.137Z
2394,10-FEB-08 09.22.35.564789 PM,direct,109,5,21863.0,158.0,-,0,2024-12-15T20:00:52.137Z
2435,02-SEP-07 11.22.53.134567 PM,direct,144,6,62303.0,159.0,-,0,2024-12-15T20:00:52.137Z


In [0]:
# Normalise every column name to lowercase.
# Build the full list of lowercased names in one pass, then rename all columns
# at once; toDF() takes the new names positionally, in the existing column order.
new_col = [column.lower() for column in orders.columns]
orders = orders.toDF(*new_col)
orders.display()

order_id,order_date,order_mode,customer_id,order_status,order_total,sales_rep_id,promotion_id,batch_id,ingest_datetime
2458,16-AUG-07 03.34.12.234359 PM,direct,101,0,78279.6,153.0,-,0,2024-12-15T20:00:52.137Z
2397,19-NOV-07 02.41.54.696211 PM,direct,102,1,42283.2,154.0,-,0,2024-12-15T20:00:52.137Z
2454,02-OCT-07 05.49.34.678340 PM,direct,103,1,6653.4,154.0,-,0,2024-12-15T20:00:52.137Z
2354,14-JUL-08 06.18.23.234567 PM,direct,104,0,46257.0,155.0,-,0,2024-12-15T20:00:52.137Z
2358,08-JAN-08 05.03.12.654278 PM,direct,105,2,7826.0,155.0,-,0,2024-12-15T20:00:52.137Z
2381,14-MAY-08 08.59.08.843679 PM,direct,106,3,23034.6,156.0,-,0,2024-12-15T20:00:52.137Z
2440,31-AUG-07 09.53.06.008765 PM,direct,107,3,70576.9,156.0,-,0,2024-12-15T20:00:52.137Z
2357,08-JAN-06 08.19.44.123456 PM,direct,108,5,59872.4,158.0,-,0,2024-12-15T20:00:52.137Z
2394,10-FEB-08 09.22.35.564789 PM,direct,109,5,21863.0,158.0,-,0,2024-12-15T20:00:52.137Z
2435,02-SEP-07 11.22.53.134567 PM,direct,144,6,62303.0,159.0,-,0,2024-12-15T20:00:52.137Z


In [0]:
# Define a transformation function to process the "orders" DataFrame

def transformation(orders):
    """Clean up an orders DataFrame and return the transformed result.

    Steps, applied in order:
      1. Parse ``order_date`` from a string into a timestamp using the pattern
         ``dd-MMM-yy hh.mm.ss.SSSSSS a`` (e.g. ``16-AUG-07 03.34.12.234359 PM``).
      2. Rename ``ingest_datetime`` to ``consume_timestamp``.
      3. Drop the ``batch_id`` column.
      4. Add ``load_tsp`` set to ``current_timestamp()`` as an audit column.

    Args:
        orders: Spark DataFrame expected to carry the ``order_date``,
            ``ingest_datetime`` and ``batch_id`` columns.

    Returns:
        A new DataFrame with the four steps above applied.
    """
    order_date_pattern = "dd-MMM-yy hh.mm.ss.SSSSSS a"
    parsed_order_date = to_timestamp(col("order_date"), order_date_pattern)

    return (
        orders
        .withColumn("order_date", parsed_order_date)
        .withColumnRenamed('ingest_datetime', 'consume_timestamp')
        .drop('batch_id')
        .withColumn('load_tsp', current_timestamp())
    )

In [0]:
# Apply the transformation to the orders DataFrame and preview the result
transformed_orders = transformation(orders=orders)
transformed_orders.display()

ORDER_ID,order_date,ORDER_MODE,CUSTOMER_ID,ORDER_STATUS,ORDER_TOTAL,SALES_REP_ID,PROMOTION_ID,consume_timestamp,load_tsp
2458,2007-08-16T15:34:12.234359Z,direct,101,0,78279.6,153.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2397,2007-11-19T14:41:54.696211Z,direct,102,1,42283.2,154.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2454,2007-10-02T17:49:34.67834Z,direct,103,1,6653.4,154.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2354,2008-07-14T18:18:23.234567Z,direct,104,0,46257.0,155.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2358,2008-01-08T17:03:12.654278Z,direct,105,2,7826.0,155.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2381,2008-05-14T20:59:08.843679Z,direct,106,3,23034.6,156.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2440,2007-08-31T21:53:06.008765Z,direct,107,3,70576.9,156.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2357,2006-01-08T20:19:44.123456Z,direct,108,5,59872.4,158.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2394,2008-02-10T21:22:35.564789Z,direct,109,5,21863.0,158.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z
2435,2007-09-02T23:22:53.134567Z,direct,144,6,62303.0,159.0,-,2024-12-15T20:00:52.137Z,2024-12-15T20:18:07.713Z


In [0]:
# new_transformed_orders =  transformed_orders.filter(col('consume_timestamp') > to_date(lit('2024-12-16')))
# new_transformed_orders.write.format('delta').mode('append').saveAsTable('stoyan.silver__orders')

In [0]:
# Write the transformed data into the "stoyan.silver__orders" Delta table.
# Save mode is 'overwrite': the table contents are replaced on each run.
# format('delta') selects the storage format; mode() only accepts save modes
# (append / overwrite / ignore / error), so 'delta' must not be passed to it.
transformed_orders.write.format('delta').mode('overwrite').saveAsTable('stoyan.silver__orders')