In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [4]:
# Build (or reuse, via getOrCreate) a Hive-enabled Spark session that runs
# against the 'local' master under the application name 'operDataFrames'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('operDataFrames')
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the session's SparkContext.
sc = spark.sparkContext

In [5]:
# Load the four retail_db CSV files. Every file is read with its first line
# treated as a header and with column types inferred by Spark.
orders = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//orders.csv')
order_items = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//order_items.csv')
customers = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//customers.csv')
products = spark.read.options(header=True, inferSchema=True).csv('data//retail_db//products.csv')

# Analytical functions 

### They are driven by the windowing class 'Window' imported from 'pyspark.sql.window'

### Basic analytical functions 

These analytical functions are count(), sum(), min(), max() and avg()


Only a partitionBy clause is required for these functions (no orderBy clause is needed)

#### Define the specification for windowing according to the problem statement

In [11]:
# Window that groups line items by the order they belong to. No ordering is
# attached to it. NOTE: the name `spec` is rebound to an ordered window in a
# later cell, so cells that use this definition must run before that one.
spec = Window.partitionBy(order_items['order_item_order_id'])

In [12]:
# Add the per-order total of subtotals next to every line item and preview it.
# `sum` here is the pyspark.sql.functions aggregate brought in by the wildcard
# import, not Python's builtin.
order_items.withColumn(
    'order_sum',
    sum('order_item_subtotal').over(spec),
).show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|         order_sum|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+------------------+
|          348|                148|                  502|                  2|              100.0|                    50.0|            479.99|
|          349|                148|                  502|                  5|              250.0|                    50.0|            479.99|
|          350|                148|                  403|                  1|             129.99|                  129.99|            479.99|
|         1129|                463|                  365|                  4|             239.96|                   59.99| 829.9200000000001|
|     

In [14]:
# Compute sum / max / min / avg / count of the subtotal over the per-order
# window `spec` and show them beside each line item, sorted by order id.
# `order_count` counts the subtotal column, while `order_count1` counts a
# literal once per row. Columns not needed for the comparison are dropped.
(
    order_items
    .withColumn('order_sum', sum('order_item_subtotal').over(spec))
    .withColumn('order_max', max('order_item_subtotal').over(spec))
    .withColumn('order_min', min('order_item_subtotal').over(spec))
    .withColumn('order_avg', avg('order_item_subtotal').over(spec))
    .withColumn('order_count', count('order_item_subtotal').over(spec))
    .withColumn('order_count1', count(lit("1")).over(spec))
    .drop('order_item_product_id', 'order_item_quantity', 'order_item_product_price')
    .orderBy('order_item_order_id')
    .show()
)

+-------------+-------------------+-------------------+------------------+---------+---------+------------------+-----------+------------+
|order_item_id|order_item_order_id|order_item_subtotal|         order_sum|order_max|order_min|         order_avg|order_count|order_count1|
+-------------+-------------------+-------------------+------------------+---------+---------+------------------+-----------+------------+
|            1|                  1|             299.98|            299.98|   299.98|   299.98|            299.98|          1|           1|
|            3|                  2|              250.0|            579.98|    250.0|   129.99|193.32666666666668|          3|           3|
|            4|                  2|             129.99|            579.98|    250.0|   129.99|193.32666666666668|          3|           3|
|            2|                  2|             199.99|            579.98|    250.0|   129.99|193.32666666666668|          3|           3|
|            8|            

## Numbering Analytical functions 

These analytical functions are rank(), dense_rank(), row_number(), lead(), lag() and ntile()

An orderBy clause must be specified in the window for these functions; partitionBy is combined with it here so that the numbering restarts for each order

In [17]:
# Per-order window ordered by subtotal, highest first.
# NOTE: this rebinds `spec`, replacing the partition-only window defined in an
# earlier cell.
spec = (
    Window
    .partitionBy(order_items['order_item_order_id'])
    .orderBy(order_items['order_item_subtotal'].desc())
)

In [19]:
# Rank every line item inside its order using the ordered window `spec`
# (descending subtotal) and preview the result.
(
    order_items
    .withColumn('rnk', rank().over(spec))
    .show()
)

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+---+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|rnk|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+---+
|          349|                148|                  502|                  5|              250.0|                    50.0|  1|
|          350|                148|                  403|                  1|             129.99|                  129.99|  2|
|          348|                148|                  502|                  2|              100.0|                    50.0|  3|
|         1132|                463|                  191|                  3|             299.97|                   99.99|  1|
|         1130|                463|                  502|                  5|              250.0|              

In [22]:
# Compare the numbering functions side by side over the ordered per-order
# window `spec`: rank, dense_rank and row_number, plus the next (lead) and
# previous (lag) subtotal within the same order. Where two items of an order
# have the same subtotal they share rnk/drnk but still get distinct rn values.
(
    order_items
    .withColumn('rnk', rank().over(spec))
    .withColumn('drnk', dense_rank().over(spec))
    .withColumn('rn', row_number().over(spec))
    .withColumn('ld', lead('order_item_subtotal').over(spec))
    .withColumn('lg', lag('order_item_subtotal').over(spec))
    .drop('order_item_product_id', 'order_item_quantity', 'order_item_product_price')
    .orderBy('order_item_order_id')
    .show()
)

+-------------+-------------------+-------------------+---+----+---+------+------+
|order_item_id|order_item_order_id|order_item_subtotal|rnk|drnk| rn|    ld|    lg|
+-------------+-------------------+-------------------+---+----+---+------+------+
|            1|                  1|             299.98|  1|   1|  1|  null|  null|
|            2|                  2|             199.99|  2|   2|  2|129.99| 250.0|
|            4|                  2|             129.99|  3|   3|  3|  null|199.99|
|            3|                  2|              250.0|  1|   1|  1|199.99|  null|
|            5|                  4|              49.98|  4|   4|  4|  null| 150.0|
|            7|                  4|              150.0|  3|   3|  3| 49.98|199.92|
|            8|                  4|             199.92|  2|   2|  2| 150.0|299.95|
|            6|                  4|             299.95|  1|   1|  1|199.92|  null|
|           12|                  5|             299.98|  1|   1|  2|299.95|299.98|
|   