In [24]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("spark batch processing application") \
        .config("spark.sql.shuffle.partitions",3) \
        .master("local[2]") \
        .getOrCreate()

In [25]:
from pyspark.sql.functions import *

In [26]:
orders_schema = "order_id long,customer_id long,customer_fname string,customer_lname string,city string,state string,pincode long,line_items array<struct<order_item_id: long,order_item_product_id: long,order_item_quantity: long,order_item_product_price: float,order_item_subtotal: float>>"

In [27]:
orders_df = spark \
.read \
.format("json") \
.schema(orders_schema) \
.option("path","../data/retail_db/data_json_orders") \
.load()

In [28]:
orders_df.createOrReplaceTempView("orders")
   

In [29]:
spark.sql("select * from orders").show()

+--------+-----------+--------------+--------------+-------------+-----+-------+--------------------+
|order_id|customer_id|customer_fname|customer_lname|         city|state|pincode|          line_items|
+--------+-----------+--------------+--------------+-------------+-----+-------+--------------------+
|       1|      11599|          Mary|        Malone|      Hickory|   NC|  28601|[{1, 957, 1, 299....|
|       2|        256|         David|     Rodriguez|      Chicago|   IL|  60625|[{2, 1073, 1, 199...|
|       4|       8827|         Brian|        Wilson|  San Antonio|   TX|  78240|[{5, 897, 2, 24.9...|
|       5|      11318|          Mary|         Henry|       Caguas|   PR|    725|[{9, 957, 1, 299....|
|       7|       4530|          Mary|         Smith|        Miami|   FL|  33161|[{14, 1073, 1, 19...|
|       8|       2911|          Mary|         Smith|       Caguas|   PR|    725|[{17, 365, 3, 59....|
|       9|       5657|          Mary|         James|     Lakewood|   OH|  44107|[{

In [30]:
exploded_orders = spark.sql("""select order_id,customer_id,city,state,
pincode,explode(line_items) lines from orders""")

In [31]:
exploded_orders.show()

+--------+-----------+-----------+-----+-------+--------------------+
|order_id|customer_id|       city|state|pincode|               lines|
+--------+-----------+-----------+-----+-------+--------------------+
|       1|      11599|    Hickory|   NC|  28601|{1, 957, 1, 299.9...|
|       2|        256|    Chicago|   IL|  60625|{2, 1073, 1, 199....|
|       2|        256|    Chicago|   IL|  60625|{3, 502, 5, 50.0,...|
|       2|        256|    Chicago|   IL|  60625|{4, 403, 1, 129.9...|
|       4|       8827|San Antonio|   TX|  78240|{5, 897, 2, 24.99...|
|       4|       8827|San Antonio|   TX|  78240|{6, 365, 5, 59.99...|
|       4|       8827|San Antonio|   TX|  78240|{7, 502, 3, 50.0,...|
|       4|       8827|San Antonio|   TX|  78240|{8, 1014, 4, 49.9...|
|       5|      11318|     Caguas|   PR|    725|{9, 957, 1, 299.9...|
|       5|      11318|     Caguas|   PR|    725|{10, 365, 5, 59.9...|
|       5|      11318|     Caguas|   PR|    725|{11, 1014, 2, 49....|
|       5|      1131

In [32]:
exploded_orders.createOrReplaceTempView("exploded_orders")

In [33]:
flattened_orders = spark.sql("""select order_id, customer_id, city, state, pincode, 
lines.order_item_id as item_id, lines.order_item_product_id as product_id,
lines.order_item_quantity as quantity,lines.order_item_product_price as price,
lines.order_item_subtotal as subtotal from exploded_orders""")

In [34]:
flattened_orders.show()

+--------+-----------+-----------+-----+-------+-------+----------+--------+------+--------+
|order_id|customer_id|       city|state|pincode|item_id|product_id|quantity| price|subtotal|
+--------+-----------+-----------+-----+-------+-------+----------+--------+------+--------+
|       1|      11599|    Hickory|   NC|  28601|      1|       957|       1|299.98|  299.98|
|       2|        256|    Chicago|   IL|  60625|      2|      1073|       1|199.99|  199.99|
|       2|        256|    Chicago|   IL|  60625|      3|       502|       5|  50.0|   250.0|
|       2|        256|    Chicago|   IL|  60625|      4|       403|       1|129.99|  129.99|
|       4|       8827|San Antonio|   TX|  78240|      5|       897|       2| 24.99|   49.98|
|       4|       8827|San Antonio|   TX|  78240|      6|       365|       5| 59.99|  299.95|
|       4|       8827|San Antonio|   TX|  78240|      7|       502|       3|  50.0|   150.0|
|       4|       8827|San Antonio|   TX|  78240|      8|      1014|   

In [35]:
flattened_orders.createOrReplaceTempView("orders_flattened")

In [36]:
aggregated_orders = spark.sql("""select customer_id, count(distinct(order_id)) as orders_placed, 
count(item_id) as products_purchased,sum(subtotal) as amount_spent 
from orders_flattened group by customer_id""")

In [37]:
aggregated_orders.createOrReplaceTempView("orders_aggregated")

In [38]:
spark.sql("select * from orders_aggregated where customer_id = 256").show()

+-----------+-------------+------------------+-----------------+
|customer_id|orders_placed|products_purchased|     amount_spent|
+-----------+-------------+------------------+-----------------+
|        256|           10|                22|4169.480072021484|
+-----------+-------------+------------------+-----------------+



In [39]:
aggregated_orders \
.repartition(1) \
.write \
.format("csv") \
.mode("overwrite") \
.option("header",True) \
.option("path","../data/retail_db/json_output_result1") \
.save()

In [40]:
spark.stop()