In [1]:
from pyspark.sql import SparkSession

import getpass
# Current OS user; used below to namespace the warehouse directory and the app name.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session running on YARN.
spark = (
    SparkSession.builder
    # NOTE(review): port '0' presumably lets Spark bind the UI to any free port — confirm.
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Windowing Functions')
    .master('yarn')
    .getOrCreate()
)


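### `count` as a special case
- When `count` is used after `groupBy`, it acts as a transformation (it returns a new DataFrame).
- Otherwise (called directly on a DataFrame), it acts as an action (it returns the row count).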

### Transformations

- orderBy
- filter
- distinct
- join

### Actions

- show
- head
- tail 
- take 
- collect


### Utility Functions

- printSchema
- cache
- createOrReplaceTempView

In [2]:
# Explicit DDL-style schema for the orders data, so column types are not inferred.
schema = "id Integer,order_date date, customer_id Integer, status String "

# Load the CSV data found at the 'orders' path using the schema above.
df = (
    spark.read
    .format('csv')
    .option('path', 'orders')
    .schema(schema)
    .load()
)

# Preview the first three rows.
df.show(3)
    

+---+----------+-----------+---------------+
| id|order_date|customer_id|         status|
+---+----------+-----------+---------------+
|  1|2013-07-25|      11599|         CLOSED|
|  2|2013-07-25|        256|PENDING_PAYMENT|
|  3|2013-07-25|      12111|       COMPLETE|
+---+----------+-----------+---------------+
only showing top 3 rows



#### Top 10 customers by number of orders placed

In [32]:
from pyspark.sql.functions import col
# Count orders per customer, rank by that count (highest first) and print the top 10.
(
    df.select('customer_id')
    .groupBy('customer_id')
    .count()
    .sort(col('count').desc())
    .show(10)
)

+-----------+-----+
|customer_id|count|
+-----------+-----+
|        569|   16|
|      12431|   16|
|       5897|   16|
|       6316|   16|
|       5624|   15|
|       5654|   15|
|      12284|   15|
|       5283|   15|
|        221|   15|
|       4320|   15|
+-----------+-----+
only showing top 10 rows



#### Find number of orders under each order status

In [37]:
# Count orders per status and print them, most frequent status first.
(
    df.groupBy('status')
    .count()
    .sort(col('count').desc())
    .show()
)

+---------------+-----+
|         status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8275|
|        PENDING| 7610|
|         CLOSED| 7557|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



#### Number of active customers (customers with at least one COMPLETE or CLOSED order)

In [46]:
# Number of distinct customers that have at least one COMPLETE or CLOSED order.
# Filter on `status` BEFORE projecting down to `customer_id`: the original
# projected first and then filtered on a column that was no longer selected,
# which only works because Spark's analyzer re-resolves the missing column.
(
    df.where("status in ('COMPLETE','CLOSED')")
    .select('customer_id')
    .distinct()
    .count()
)

11405

#### Customers with the most closed orders

In [58]:
# Count CLOSED orders per customer, order by that count (highest first) and
# keep the first 14 rows.
# NOTE(review): the cell ends on a DataFrame, not an action such as show();
# the rendered table presumably relies on eager evaluation being enabled in
# this environment — confirm. The limit of 14 is a magic number; verify intent.
(
    df.where("status in ('CLOSED')")
    .groupBy('customer_id')
    .count()
    .sort("count", ascending=False)
    .limit(14)
)

customer_id,count
1833,6
1363,5
1687,5
5493,5
7879,4
3631,4
2236,4
1521,4
7850,4
10111,4
