In [1]:
# NOTE(review): spark_env is not a PySpark module -- presumably a project-local
# helper that centralises session configuration; confirm where it lives.
import spark_env

# Create the session handle (`spark`) that the CSV load cell reads through.
# The string argument is presumably the Spark application name -- verify
# against spark_env.create_spark_session.
spark = spark_env.create_spark_session('pyspark_advanced_functions')

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Read the BigMart sales CSV into a DataFrame. `header` takes the column
# names from the first row; `inferSchema` lets Spark derive column types
# from the data instead of reading everything as strings.
df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('BigMart Sales.csv')
)

In [8]:
# Narrow the frame to the three columns the pivot example needs and discard
# rows whose Outlet_Size is null.
pivot_df_sample = (
    df
    .select('Item_Type', 'Outlet_Size', 'Item_MRP')
    .na.drop(subset=['Outlet_Size'])
)
pivot_df_sample.show(5)

+------------+-----------+--------+
|   Item_Type|Outlet_Size|Item_MRP|
+------------+-----------+--------+
|       Dairy|     Medium|249.8092|
| Soft Drinks|     Medium| 48.2692|
|        Meat|     Medium| 141.618|
|   Household|       High| 53.8614|
|Baking Goods|     Medium| 51.4008|
+------------+-----------+--------+
only showing top 5 rows



## Pivot

In [10]:
# One row per Item_Type, one column per distinct Outlet_Size value; each cell
# holds the average Item_MRP for that pair, rounded to 2 decimals.
# `round` here is pyspark.sql.functions.round (brought in by the star import),
# not the Python builtin -- it operates on a Column.
(
    pivot_df_sample
    .groupBy('Item_Type')
    .pivot('Outlet_Size')
    .agg(round(avg('Item_MRP'), 2))
    .show(5)
)

+--------------------+------+------+------+
|           Item_Type|  High|Medium| Small|
+--------------------+------+------+------+
|       Starchy Foods|158.16|148.42|150.27|
|        Baking Goods| 129.2|126.18|125.21|
|              Breads|133.76|140.86|145.52|
|Fruits and Vegeta...|145.57|142.97|148.31|
|                Meat|137.24|136.42| 145.7|
+--------------------+------+------+------+
only showing top 5 rows



## When-Otherwise

1. Create a column that classifies a row into 'Veg' or 'Non-veg' based on the item in the 'Item_Type' column
2. Create a column that flags vegetarian items by price: if the item is vegetarian and its MRP is more than 100, mark it as expensive-veg; otherwise mark it as non-expensive.

In [13]:
# Add a 'cuisine-flag' column: rows whose Item_Type is 'Meat' are labelled
# 'Non-Vegeterian', every other row 'Vegeterian'.
# The column is referenced with its exact schema name ('Item_Type') so the
# lookup does not depend on Spark's default case-insensitive resolution.
# NOTE(review): only 'Meat' is treated as non-vegetarian -- confirm whether
# any other Item_Type values should be classified the same way.
df = df.withColumn(
    'cuisine-flag',
    when(col('Item_Type') == 'Meat', 'Non-Vegeterian').otherwise('Vegeterian'),
)
df.show(5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+--------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|  cuisine-flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+--------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|    Vegeterian|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|      

In [15]:
# Preview a price flag for vegetarian items (result is shown, not stored):
#   vegetarian and Item_MRP > 100 -> 'Expensive'
#   any other vegetarian row      -> 'Inexpensive'
#   everything else               -> 'Non-Veg'
# The second branch deliberately tests only the cuisine flag, so a vegetarian
# row with Item_MRP exactly 100 (or a null Item_MRP) is labelled 'Inexpensive'
# rather than slipping through both price comparisons and being mislabelled
# 'Non-Veg'.
(
    df.withColumn(
        'Vegeterian_expense_flag',
        when((col('cuisine-flag') == 'Vegeterian') & (col('Item_MRP') > 100), 'Expensive')
        .when(col('cuisine-flag') == 'Vegeterian', 'Inexpensive')
        .otherwise('Non-Veg'),
    )
    .show(5)
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+--------------+-----------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|  cuisine-flag|Vegeterian_expense_flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+--------------+-----------------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|    Vegeterian|              Expensive|
|          DRC01|   