In [0]:
spark

In [0]:
from pyspark.sql.functions import *

In [0]:
orders_schema = "order_id long,customer_id long,customer_fname string,customer_lname string,city string,state string,pincode long,line_items array<struct<order_item_id: long,order_item_product_id: long,order_item_quantity: long,order_item_product_price: float,order_item_subtotal: float>>"

In [0]:
dbutils.fs.mkdirs("dbfs:/FileStore/streaming_input/input1")

True

In [0]:
orders_df = spark \
.readStream \
.format("json") \
.schema(orders_schema) \
.option("path","dbfs:/FileStore/streaming_input/input1") \
.load()

In [0]:
orders_df.createOrReplaceTempView("orders")


In [0]:
exploded_orders = spark.sql("""select order_id,customer_id,city,state,
pincode,explode(line_items) lines from orders""")

In [0]:
exploded_orders.createOrReplaceTempView("exploded_orders")

In [0]:
flattened_orders = spark.sql("""select order_id, customer_id, city, state, pincode, 
lines.order_item_id as item_id, lines.order_item_product_id as product_id,
lines.order_item_quantity as quantity,lines.order_item_product_price as price,
lines.order_item_subtotal as subtotal from exploded_orders""")

In [0]:
flattened_orders.createOrReplaceTempView("orders_flattened")

In [0]:
aggregated_orders = spark.sql("""select customer_id, approx_count_distinct(order_id) as orders_placed, 
count(item_id) as products_purchased,sum(subtotal) as amount_spent 
from orders_flattened group by customer_id""")

In [0]:
streaming_query = aggregated_orders \
.writeStream \
.format("delta") \
.outputMode("complete") \
.option("checkpointLocation","checkpointdir103") \
.toTable("orders_result103")

In [0]:
spark.sql("select * from orders_result103").show()

+-----------+-------------+------------------+------------------+
|customer_id|orders_placed|products_purchased|      amount_spent|
+-----------+-------------+------------------+------------------+
|       9557|            1|                 4|1299.9200439453125|
|       8355|            1|                 4| 619.9200172424316|
|      11318|            1|                 5|1129.8600387573242|
|      11599|            3|                 7|1544.9200401306152|
|       6272|            1|                 1| 99.98999786376953|
|      10280|            1|                 1|179.97000122070312|
|        256|            1|                 6|1159.9600219726562|
|       8827|            1|                 4| 699.8500099182129|
|       5882|            1|                 1|119.97000122070312|
+-----------+-------------+------------------+------------------+



In [0]:
streaming_query.stop()

In [0]:
dbutils.fs.rm("dbfs:/FileStore/streaming_input/input1",True)

True