# Lesson 22 - Example: Warehouse Inventory

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

# Reuse the active SparkSession if one exists; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()
# Handle to the SparkContext underlying this session.
sc = spark.sparkContext

## Introduction

In this lesson, we will use a multi-table dataset to explore practical applications of joining, grouping, and aggregation operations. The dataset is small and contains fictional data representing the inventory of different products stored at different warehouses owned by a retail chain.

The first DataFrame we will create is `products`. This is a lookup table with information about each product sold by the retailer. This table contains the product ID number, product name, and unit price for each product.

In [0]:
# Product lookup table: one row per product with its ID, name, and unit price.
products = spark.createDataFrame(
    [
        (101, 'Widget', 10.99),
        (102, 'Sprocket', 12.49),
        (103, 'Grommet', 9.99),
        (104, 'Doodad', 15.89),
    ],
    schema='pid INTEGER, product STRING, price DOUBLE',
)

products.show()

We will now create the `warehouses` DataFrame. This is a lookup table with information about each warehouse owned by the retailer. This table contains the warehouse ID number, city, and region for each warehouse.

In [0]:
# Warehouse lookup table: one row per warehouse with its ID, city, and region.
warehouses = spark.createDataFrame(
    [
        (101, 'St. Louis', 'A'),
        (102, 'Kansas City', 'A'),
        (103, 'Seattle', 'B'),
        (104, 'San Francisco', 'B'),
        (105, 'Chicago', 'A'),
    ],
    schema='wid INTEGER, city STRING, region STRING',
)

warehouses.show()

Finally, we will create the `inventory` DataFrame. This table contains the bulk of the data. It contains one row for each product/warehouse pair for which the given product is present at the given warehouse. The columns in this table represent the warehouse ID number, the product ID number, and the number of units of the product stored at that warehouse.

In [0]:
# Stock table: one (warehouse ID, product ID, units) row for each product
# that is present at a given warehouse.
inventory = spark.createDataFrame(
    [
        (101, 101, 45), (101, 103, 23), (101, 104, 62),
        (102, 102, 12), (102, 103, 31),
        (103, 102, 54), (103, 104, 24),
        (104, 101, 34), (104, 102, 23), (104, 103, 18), (104, 104, 29),
        (105, 101, 57), (105, 103, 61),
    ],
    schema='wid INTEGER, pid INTEGER, units INTEGER',
)

inventory.show()

## Inventory Summary

Our first task will be to restate the information provided in the `inventory` DataFrame, but with city and product names listed as opposed to warehouse and product IDs. We will also provide the total value of each product at each warehouse.

In [0]:
# Swap the warehouse and product IDs for city and product names, and add the
# value of each stock entry (units * price, rounded to two decimal places).
(
    inventory
    .join(warehouses, on='wid', how='left')
    .join(products, on='pid', how='left')
    .select('city', 'product', 'units', expr('ROUND(units * price, 2) AS value'))
    .orderBy('city', 'product')
    .show()
)

## Total Inventory By Product

We will now determine the total number of units of each product across all warehouses, as well as the total value of those units.

In [0]:
(
    inventory
    # Aggregate first: total units of each product across all warehouses.
    .groupBy('pid')
    .agg(expr('SUM(units) AS total_units'))
    # Then join the product lookup table to pick up the name and unit price.
    .join(products, on='pid', how='left')
    .select(
        'product',
        'total_units',
        expr('ROUND(total_units*price, 2) AS total_value'),
    )
    # Present the result alphabetically by product name.
    .orderBy('product')
    .show()
)

In [0]:
# Alternative, not recommended: the product table is joined onto every
# inventory row before aggregating, and rows are grouped by product name
# rather than by pid.
(
    inventory
    .join(products, on='pid', how='left')
    .withColumn('value', expr('units * price'))
    .groupBy('product')
    .agg(expr('SUM(units) AS units'), expr('ROUND(SUM(value), 2) AS value'))
    .orderBy('product')
    .show()
)

## Total Inventory Value by Warehouse

Our next task will be to determine the total value of all products stored at each individual warehouse.

In [0]:
# Preferred approach: price each inventory row, aggregate by warehouse ID,
# and only then join the warehouse lookup table, so that join runs on one
# row per warehouse instead of one row per inventory entry.
(
    inventory
    .join(products, on='pid', how='left')
    .withColumn('value', expr('price*units'))
    .groupBy('wid')
    .agg(expr('ROUND(SUM(value),2) AS total_value'))
    .join(warehouses, on='wid', how='left')
    .select('city', 'total_value')
    .orderBy('city')
    .show()
)

In [0]:
# Alternative, not ideal: both lookup tables are joined onto every inventory
# row before aggregating, and rows are grouped by city name rather than wid.
(
    inventory
    .join(products, on='pid', how='left')
    .join(warehouses, on='wid', how='left')
    .withColumn('value', expr('units * price'))
    .groupBy('city')
    .agg(expr('ROUND(SUM(value), 2) AS value'))
    .orderBy('city')
    .show()
)

## Total Inventory Value by Region

We will now determine the total value of all products stored at warehouses within each region.

In [0]:
# Display the warehouses table again for reference; it holds the `region`
# column used by the queries in this section.
warehouses.show()

In [0]:
(
    inventory
    # Attach the region, then total the units of each product per region.
    .join(warehouses, on='wid', how='left')
    .groupBy('region', 'pid')
    .agg(expr('SUM(units) AS units'))
    # Price the per-region product totals.
    .join(products, on='pid', how='left')
    .withColumn('value', expr('price*units'))
    # Collapse to a single total value per region.
    .groupBy('region')
    .agg(expr('ROUND(SUM(value),2) AS value'))
    .orderBy('region')
    .show()
)

In [0]:
# Alternative: join both lookup tables onto every inventory row, price each
# row, and aggregate once by region.
(
    inventory
    .join(products, on='pid', how='left')
    .join(warehouses, on='wid', how='left')
    .withColumn('value', expr('units * price'))
    .groupBy('region')
    .agg(expr('ROUND(SUM(value), 2) AS value'))
    .orderBy('region')
    .show()
)

## Total Inventory by Region and Product

In the next example, we will determine the number of units and the total value for each region/product pair.

In [0]:
(
    inventory
    # Attach the region, then total the units for each product/region pair.
    .join(warehouses, on='wid', how='left')
    .groupBy('pid', 'region')
    .agg(expr('SUM(units) AS units'))
    # Join the product lookup table for the name and unit price.
    .join(products, on='pid', how='left')
    .select(
        'region',
        'product',
        'units',
        expr('ROUND(units*price, 2) AS value'),
    )
    .orderBy('region', 'product')
    .show()
)

In [0]:
# Alternative: join both lookup tables onto every inventory row, price each
# row, and aggregate once by region and product name.
(
    inventory
    .join(products, on='pid', how='left')
    .join(warehouses, on='wid', how='left')
    .withColumn('value', expr('units * price'))
    .groupBy('region', 'product')
    .agg(expr('SUM(units) AS units'), expr('ROUND(SUM(value), 2) AS value'))
    .orderBy('region', 'product')
    .show()
)

## Products Available in Each City

We will now determine the products that are available within each city.

In [0]:
# One row per (city, product) pair that appears in the inventory table.
(
    inventory
    .join(products, on='pid', how='left')
    .join(warehouses, on='wid', how='left')
    .select('city', 'product')
    .orderBy('city', 'product')
    .show()
)

In [0]:
# One row per city, with the products present in its inventory collected
# into a single array column.
(
    inventory
    .join(products, 'pid', 'left')
    .join(warehouses, 'wid', 'left')
    .groupBy('city')
    # COLLECT_LIST makes no ordering guarantee after the groupBy shuffle, so
    # sort the collected array to keep the displayed lists stable across runs.
    .agg(expr('SORT_ARRAY(COLLECT_LIST(product)) AS products_avail'))
    .sort('city')
    .show(truncate=False)
)

## Products Not Available in Each City

For our final task, we will determine which products are unavailable in each of the cities.

In [0]:
# Build every (warehouse, product) combination, then keep only those with no
# matching row in the inventory table.
(
    warehouses
    .crossJoin(products)
    .join(inventory, on=['wid', 'pid'], how='anti')
    .select('city', 'product')
    .orderBy('city', 'product')
    .show()
)

In [0]:
# One row per city, with the products that have no inventory row for that
# city's warehouse collected into a single array column.
(
    warehouses
    .crossJoin(products)
    # The anti join keeps only (warehouse, product) pairs absent from inventory.
    .join(inventory, ['wid', 'pid'], 'anti')
    .groupBy('city')
    # COLLECT_LIST makes no ordering guarantee after the groupBy shuffle, so
    # sort the collected array to keep the displayed lists stable across runs.
    .agg(expr('SORT_ARRAY(COLLECT_LIST(product)) AS products_not_avail'))
    .sort('city')
    .show(truncate=False)
)