In [0]:
# Switch the session to the silver-layer database so the unqualified table
# names used below resolve there.
spark.sql("use database Retail_DB_Second_Layer")

# DDL for the silver orders table; a no-op when the table already exists.
# transaction_date is declared as a string column, and last_updated_at is the
# timestamp that the incremental-load cell reads back as its watermark.
silver_orders_ddl = """
          create table if not exists silver_orders(
             transaction_id string,
            customer_id integer,
            product_id integer,
            quantity integer,
            total_amount double,
            order_status string,
            transaction_date string,
            payment_method string,
            store_type string,
            last_updated_at timestamp
          )"""
spark.sql(silver_orders_ddl)

DataFrame[]

In [0]:
# Incremental watermark: the newest last_updated_at already stored in
# silver_orders. Only bronze rows received after this instant are picked up.
#
# The value is cast to a string inside Spark SQL on purpose. Collecting a
# timestamp column hands back a Python datetime rendered in the driver's local
# timezone, while the literal is later parsed by Spark in the session
# timezone; if the two differ the watermark silently shifts. Formatting and
# parsing both in Spark keeps the round trip consistent.
#
# NOTE(review): last_updated_at is the silver processing time
# (current_timestamp() at merge), but it is compared against the bronze
# recived_at column. Bronze rows that land while a run is in progress could
# fall behind the next watermark - confirm this is acceptable.
last_updated_df = spark.sql(
    'select cast(max(last_updated_at) as string) as last_completed from silver_orders'
)
# A global aggregate always yields exactly one row, so first() is safe here.
last_updated_time = last_updated_df.first()['last_completed']
if last_updated_time is None:
    # Empty silver table (first run): fall back to a date far in the past so
    # that every bronze row qualifies.
    last_updated_time = '1900-01-01T00:00:00.000+00:00'

In [0]:
# Session-scoped temporary view exposing only the bronze rows whose
# recived_at is later than the watermark computed in the previous cell.
# The watermark is interpolated into the SQL text as a quoted literal.
incremental_view_sql = f"""
          create or replace temporary view orders_incremental as
          select * from Retail_DB_Initial.Bronze_orders as c where c.recived_at  > '{last_updated_time}' """
spark.sql(incremental_view_sql)

DataFrame[]

In [0]:
# Eyeball a handful of incremental rows before transforming them.
incremental_preview = spark.sql("select *  from orders_incremental limit 10")
incremental_preview.show()

+--------------+-----------+----------+--------+------------+----------------+--------------+--------------+--------------------+
|transaction_id|customer_id|product_id|quantity|total_amount|transaction_date|payment_method|    store_type|          recived_at|
+--------------+-----------+----------+--------+------------+----------------+--------------+--------------+--------------------+
|     TRX000001|        802|       425|       1|       363.4|      2020-07-27|    Debit Card|Physical Store|2024-12-24 02:58:...|
|     TRX000002|        858|       280|       6|      758.18|      2022-08-10|   Credit Card|Physical Store|2024-12-24 02:58:...|
|     TRX000003|        658|       694|       9|      748.66|      2020-05-22| Bank Transfer|        Online|2024-12-24 02:58:...|
|     TRX000004|        516|       930|       4|      933.78|            NULL| Bank Transfer|Physical Store|2024-12-24 02:58:...|
|     TRX000005|        368|       104|      10|      137.28|      2022-06-24|        PayP

In [0]:
# Number of bronze rows selected by the watermark filter for this run.
incremental_row_count = spark.sql("select count(*)  from orders_incremental")
incremental_row_count.show()

+--------+
|count(*)|
+--------+
|   20000|
+--------+



In [0]:
# Cleaned, silver-shaped projection of the incremental bronze rows.
#
# Rules applied per column:
#   * quantity / total_amount : negative values are clamped to 0.
#   * order_status            : 'Cancelled' when the amount or quantity is
#                               zero OR negative, otherwise 'Completed'. The
#                               comparison runs against the raw bronze
#                               columns, so it must use <= 0; with a plain
#                               = 0 a negative value would be emitted as 0
#                               yet still be labelled 'Completed'.
#   * transaction_date        : parsed as yyyy-MM-dd.
#   * payment_method          : trimmed, 'Unknown' when null.
#   * store_type              : trimmed + initcap, 'Unknown' when null.
#   * last_updated_at         : stamped with current_timestamp().
#
# Rows are kept only when transaction_id, customer_id and product_id are all
# present and the parsed transaction_date is not in the future.
#
# NOTE(review): a null transaction_date makes the date predicate evaluate to
# NULL, so those rows are filtered out and the "else null" branch of the
# transaction_date CASE is never reached - confirm dropping them is intended.
spark.sql("""
          create or replace temporary view orders_incremental_view as
          select
          transaction_id,
          customer_id,
          product_id,
          case
            when quantity < 0 then 0
            else quantity
          end as quantity,
          case
            when total_amount < 0 then 0
            else total_amount
          end as total_amount,
          case
            when transaction_date is not null then to_date(transaction_date, 'yyyy-MM-dd')
            else null
          end as transaction_date,
          case
            when total_amount <= 0 or quantity <= 0 then 'Cancelled'
            else 'Completed'
          end as order_status,
          case
            when payment_method is not null then trim(payment_method)
            else 'Unknown'
          end as payment_method,
          case
            when store_type is not null then initcap(trim(store_type))
            else 'Unknown'
          end as store_type,
          current_timestamp() as last_updated_at
          from orders_incremental
          where transaction_id is not null and
          customer_id is not null and
          product_id is not null and to_date(transaction_date, 'yyyy-MM-dd') <= current_date()
          """)

DataFrame[]

In [0]:
# Upsert the cleaned incremental rows into silver_orders, keyed on
# transaction_id: rows whose key already exists are overwritten, all other
# rows are inserted.
#
# NOTE(review): MERGE can fail when several source rows match the same target
# row - confirm transaction_id is unique within orders_incremental_view.
upsert_sql = """ 
          merge into silver_orders target
          using orders_incremental_view source
          on source.transaction_id = target.transaction_id
          when matched then
          update set *
          when not matched then 
          insert *"""
spark.sql(upsert_sql)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Total rows in silver_orders after the merge.
silver_row_count = spark.sql("select count(*) from silver_orders")
silver_row_count.show()

+--------+
|count(*)|
+--------+
|   19428|
+--------+

