In [12]:
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('customer_order')
    .getOrCreate()
)

In [13]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Explicit schema for the CSV: three nullable columns, read in file order.
# The file is read with the reader's default options, so no header row is
# expected and the column names come from this schema.
schema = (
    StructType()
    .add("cust_id", StringType(), True)
    .add("item_id", IntegerType(), True)
    .add("amount", FloatType(), True)
)

# Load the orders file using the schema above instead of inferring types.
data = spark.read.csv("customer-orders.csv", schema=schema)

In [14]:
# Preview the loaded orders (first 20 rows, long cell values truncated).
data.show(n=20, truncate=True)

+-------+-------+------+
|cust_id|item_id|amount|
+-------+-------+------+
|     44|   8602| 37.19|
|     35|   5368| 65.89|
|      2|   3391| 40.64|
|     47|   6694| 14.98|
|     29|    680| 13.08|
|     91|   8900| 24.59|
|     70|   3959| 68.68|
|     85|   1733| 28.53|
|     53|   9900| 83.55|
|     14|   1505|  4.32|
|     51|   3378|  19.8|
|     42|   6926| 57.77|
|      2|   4424| 55.77|
|     79|   9291| 33.17|
|     50|   3901| 23.57|
|     20|   6633|  6.49|
|     15|   6148| 65.53|
|     44|   8331| 99.19|
|      5|   3505| 64.18|
|     48|   5539| 32.42|
+-------+-------+------+
only showing top 20 rows



In [15]:
from pyspark.sql.functions import sum, format_number

# Total spend per customer, ranked from highest to lowest.
#
# NOTE: `sum` here is pyspark.sql.functions.sum (imported in this cell), not
# the Python builtin.
#
# The ranking must be done on the numeric total. `format_number` produces a
# *string* column (e.g. "6,375.45"), and ordering on that string is
# lexicographic, so "981.20" would rank above "6,375.45". Sort first, then
# format purely for display.
cust_totals = (
    data
    .groupBy("cust_id")
    .agg(sum("amount").alias("total"))
)

cust_data = (
    cust_totals
    .orderBy("total", ascending=False)
    # Projection keeps the sorted row order; "Sum" stays a formatted string
    # with thousands separators and two decimals, as before.
    .select("cust_id", format_number("total", 2).alias("Sum"))
)



In [16]:
# Display the top 20 customers by total spend.
cust_data.show(n=20, truncate=True)

# Release the Spark session; cells using `spark` must be re-run from the
# session-creation cell after this point.
spark.stop()

+-------+--------+
|cust_id|     Sum|
+-------+--------+
|     68|6,375.45|
|     73|6,206.20|
|     39|6,193.11|
|     54|6,065.39|
|     71|5,995.66|
|      2|5,994.59|
|     97|5,977.19|
|     46|5,963.11|
|     42|5,696.84|
|     59|5,642.89|
|     41|5,637.62|
|      0|5,524.95|
|      8|5,517.24|
|     85|5,503.43|
|     61|5,497.48|
|     32|5,496.05|
|     58|5,437.73|
|     63|5,415.15|
|     15|5,413.51|
|      6|5,397.88|
+-------+--------+
only showing top 20 rows

