Use case - Spark DataFrame and SQL:

In [1]:
# findspark must be *called* so that pyspark becomes importable; a bare
# `findspark.init` only references the function and does nothing.
import findspark
findspark.init()
import getpass
from pyspark.sql import SparkSession

# Current OS user name; used to build the per-user warehouse directory below.
username = getpass.getuser()

# Local-mode SparkSession with Hive support enabled and the warehouse
# directory placed under the current user's Documents folder.
spark = (
    SparkSession.builder
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", f"/Users/{username}/Documents/data/warehouse")
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/25 20:36:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the orders CSV: the first row is treated as the header and column types
# are inferred from the data. The path is built from `username` (defined in the
# session-setup cell) instead of a hard-coded user directory, so it is
# consistent with how the warehouse directory is configured.
orders_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"/Users/{username}/Documents/data/orders_wh.csv")
)

In [3]:
# Preview the first five rows of the loaded orders data.
orders_df.show(n=5)

+--------+-------------------+-----------+------------+
|order_id|         order_date|customer_id|order_status|
+--------+-------------------+-----------+------------+
|       1|2013-07-27 00:00:00|      30265|      CLOSED|
|       2|2013-11-25 00:00:00|      20386|      CLOSED|
|       3|2014-01-21 00:00:00|      15768|    COMPLETE|
|       4|2014-07-04 00:00:00|      27181|  PROCESSING|
|       5|2014-03-08 00:00:00|      12448|    COMPLETE|
+--------+-------------------+-----------+------------+
only showing top 5 rows



In [4]:
# Print each column's name, inferred type, and nullability.
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [5]:
# Register the DataFrame as the temp view "orders" so the spark.sql(...) cells
# below can query it by name.
orders_df.createOrReplaceTempView("orders")

1. Top 15 customers who placed the most orders

In [9]:
# Count orders per customer, rank by that count (highest first), keep 15 rows.
result = (
    orders_df
    .groupBy("customer_id")
    .count()
    .orderBy("count", ascending=False)
    .limit(15)
)
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|      42247|   11|
|      30302|   10|
|      48452|   10|
|       5305|   10|
|      44607|   10|
|      31823|    9|
|      39444|    9|
|       4244|    9|
|       9327|    9|
|      27462|    9|
|      34386|    9|
|      14519|    9|
|       8745|    9|
|      22450|    9|
|      28522|    9|
+-----------+-----+



In [11]:
# SQL equivalent of the DataFrame query above, run against the "orders" view.
result = spark.sql(
    "select customer_id, count(*) as count "
    "from orders "
    "group by customer_id "
    "order by count desc "
    "limit 15"
)
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|      42247|   11|
|      30302|   10|
|      48452|   10|
|       5305|   10|
|      44607|   10|
|      31823|    9|
|      39444|    9|
|       4244|    9|
|       9327|    9|
|      27462|    9|
|      34386|    9|
|      14519|    9|
|       8745|    9|
|      22450|    9|
|      28522|    9|
+-----------+-----+



2. Find the number of orders under each order status

In [12]:
# Number of orders for each distinct order_status value.
result = (
    orders_df
    .groupBy("order_status")
    .count()
)
result.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|25099|
|       COMPLETE|24837|
|     PROCESSING|25048|
|         CLOSED|25016|
+---------------+-----+



In [14]:
# SQL equivalent of the DataFrame groupBy("order_status").count() above.
# count(*) counts every row in the group, matching DataFrame.count();
# count(order_id) would skip rows whose order_id is null.
result = spark.sql(
    "select order_status, count(*) as count "
    "from orders "
    "group by order_status"
)
result.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|25099|
|       COMPLETE|24837|
|     PROCESSING|25048|
|         CLOSED|25016|
+---------------+-----+



3. Number of active customers (who placed at least one order)

In [15]:
# Number of distinct customer_id values that appear in the orders data.
result = (
    orders_df
    .select("customer_id")
    .distinct()
    .count()
)
print(result)

43241


In [16]:
# SQL version: count the distinct customer_id values in the "orders" view.
spark.sql(
    "select count(distinct(customer_id)) as active_customers "
    "from orders"
).show()


+----------------+
|active_customers|
+----------------+
|           43241|
+----------------+



4. Customer with the most closed orders

In [17]:
# Keep only CLOSED orders, count them per customer, and rank highest first.
result = (
    orders_df
    .where("order_status = 'CLOSED'")
    .groupBy("customer_id")
    .count()
    .orderBy("count", ascending=False)
)
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       9192|    6|
|       9207|    5|
|      42096|    5|
|      21078|    5|
|        624|    5|
|       4086|    5|
|      43851|    5|
|      35155|    5|
|      38525|    5|
|      33360|    5|
|      18797|    4|
|      38911|    4|
|      46049|    4|
|      48102|    4|
|      13351|    4|
|       8007|    4|
|      29566|    4|
|       8449|    4|
|      42119|    4|
|      10112|    4|
+-----------+-----+
only showing top 20 rows



In [19]:
# SQL equivalent: CLOSED orders per customer, highest count first.
result = spark.sql(
    "select customer_id, count(*) as count "
    "from orders "
    "where order_status = 'CLOSED' "
    "group by customer_id "
    "order by count desc"
)
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       9192|    6|
|       9207|    5|
|      42096|    5|
|      21078|    5|
|        624|    5|
|       4086|    5|
|      43851|    5|
|      35155|    5|
|      38525|    5|
|      33360|    5|
|      18797|    4|
|      38911|    4|
|      46049|    4|
|      48102|    4|
|      13351|    4|
|       8007|    4|
|      29566|    4|
|       8449|    4|
|      42119|    4|
|      10112|    4|
+-----------+-----+
only showing top 20 rows

