
## Advanced PySpark Transformations

Now we will take this up another notch with more complex transformations in PySpark.

Let's load our data and get started!

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Load the BigMart sales CSV: the first line is treated as the header and
# column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/Volumes/workspace/default/tutorial_files/BigMartSales.csv')
)

### PIVOT

In [0]:
# Average Item_MRP with one row per Item_Type and one column per Outlet_Size
# value; rows with a missing Outlet_Size end up in the `null` column.
(
    df.groupBy('Item_Type')
    .pivot('Outlet_Size')
    .agg(avg('Item_MRP'))
    .orderBy('Item_Type')
    .limit(10)
    .display()
)

Item_Type,null,High,Medium,Small
Baking Goods,126.66939891891889,129.20204383561642,126.1785684729064,125.21336363636368
Breads,139.04861666666667,133.75896,140.8610385542169,145.5236507042254
Breakfast,158.6750903225807,147.49058461538462,134.53751111111112,130.56802666666667
Canned,140.65181123595508,135.4427076923077,138.12485069124423,142.29542857142857
Dairy,149.0512677419355,153.50917249999995,148.51217431192666,145.94210101010103
Frozen Foods,137.49448464730293,136.82925,140.55701532846714,137.83854377510033
Fruits and Vegetables,142.57516045845267,145.57287042253515,142.9714702179177,148.31336951219507
Hard Drinks,134.3875333333333,141.9275217391304,142.83769599999994,129.758784
Health and Hygiene,130.55989019607844,135.11098032786884,128.70186470588237,131.83153529411757
Household,147.76930421455944,147.09752233009704,147.71133010380618,153.9654389105058


### When-Otherwise
#### Problem 1: Flag records as Veg or Non Veg based on the `Item_Type` using When-Otherwise

In [0]:
# Rows whose Item_Type is 'Meat' are flagged Non-Veg; every other row falls
# through to otherwise() and is flagged Veg.
(
    df.withColumn(
        'Veg_Flag',
        when(col('Item_Type') == 'Meat', 'Non-Veg').otherwise('Veg'),
    )
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Veg_Flag
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Veg
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Veg
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Non-Veg
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,Veg
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Veg


#### Problem 2: Create a new column that categorizes each item as Veg with an MRP of 100 or less, Veg with an MRP over 100, or Non-Veg

In [0]:
# Classify Meat first (the same rule as Problem 1's Veg_Flag), then split the
# remaining rows by price. Previously a row with a null Item_Type or a null
# Item_MRP matched neither branch and was mislabelled 'Non-Veg'; now only Meat
# is Non-Veg, and a row whose price is unknown is left null.
(
    df.withColumn(
        'Veg_Expensive',
        when(col('Item_Type') == 'Meat', 'Non-Veg')
        .when(col('Item_MRP') > 100, 'Veg-Expensive')
        .when(col('Item_MRP') <= 100, 'Veg-Inexpensive')
        .otherwise(lit(None)),
    )
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Veg_Expensive
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Veg-Expensive
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Veg-Inexpensive
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Non-Veg
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,Veg-Expensive
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Veg-Inexpensive


### JOINs

In [0]:

# Characters, each tagged with the id of the location it belongs to.
# 'l06' has no matching row in the locations table below.
schemaj1 = 'char_id STRING, char_name STRING, location_id STRING'
dataj1 = [
    ('1', 'Jon', 'l01'),
    ('2', 'Robb', 'l02'),
    ('3', 'Aarya', 'l02'),
    ('4', 'Cersie', 'l03'),
    ('5', 'Jamie', 'l03'),
    ('6', 'Tyrion', 'l05'),
    ('7', 'Daenerys', 'l05'),
    ('8', 'Margaery', 'l06'),
]
df1 = spark.createDataFrame(data=dataj1, schema=schemaj1)

# Locations keyed by location_id; 'l04' is not used by any character above.
schemaj2 = 'location_id STRING, location STRING'
dataj2 = [
    ('l01', 'The Wall'),
    ('l02', 'Winterfell'),
    ('l03', 'Kings Landing'),
    ('l04', 'Beyond'),
    ('l05', 'Dragonstone'),
]
df2 = spark.createDataFrame(data=dataj2, schema=schemaj2)

In [0]:
# Preview both join inputs, characters first and then locations.
for frame in (df1, df2):
    frame.display()

char_id,char_name,location_id
1,Jon,l01
2,Robb,l02
3,Aarya,l02
4,Cersie,l03
5,Jamie,l03
6,Tyrion,l05
7,Daenerys,l05
8,Margaery,l06


location_id,location
l01,The Wall
l02,Winterfell
l03,Kings Landing
l04,Beyond
l05,Dragonstone


#### Inner Join

In [0]:
# Inner join: only characters whose location_id also exists in df2 are kept.
# Joining on an expression keeps both location_id columns in the result.
same_location = df1['location_id'] == df2['location_id']
df1.join(df2, on=same_location, how='inner').display()

char_id,char_name,location_id,location_id.1,location
1,Jon,l01,l01,The Wall
2,Robb,l02,l02,Winterfell
3,Aarya,l02,l02,Winterfell
4,Cersie,l03,l03,Kings Landing
5,Jamie,l03,l03,Kings Landing
6,Tyrion,l05,l05,Dragonstone
7,Daenerys,l05,l05,Dragonstone


#### Left Join

In [0]:
# Left join: every character is kept; the df2 columns are null when the
# character's location_id has no match (Margaery / 'l06').
same_location = df1['location_id'] == df2['location_id']
df1.join(df2, on=same_location, how='left').display()

char_id,char_name,location_id,location_id.1,location
1,Jon,l01,l01,The Wall
2,Robb,l02,l02,Winterfell
3,Aarya,l02,l02,Winterfell
4,Cersie,l03,l03,Kings Landing
5,Jamie,l03,l03,Kings Landing
6,Tyrion,l05,l05,Dragonstone
7,Daenerys,l05,l05,Dragonstone
8,Margaery,l06,,


#### Right Join

In [0]:
# Right join: every location is kept; the df1 columns are null for a location
# that no character references ('l04' / Beyond).
same_location = df1['location_id'] == df2['location_id']
df1.join(df2, on=same_location, how='right').display()

char_id,char_name,location_id,location_id.1,location
1.0,Jon,l01,l01,The Wall
3.0,Aarya,l02,l02,Winterfell
2.0,Robb,l02,l02,Winterfell
5.0,Jamie,l03,l03,Kings Landing
4.0,Cersie,l03,l03,Kings Landing
,,,l04,Beyond
7.0,Daenerys,l05,l05,Dragonstone
6.0,Tyrion,l05,l05,Dragonstone


#### Anti Join

In [0]:
# Anti join: returns only the df1 rows that have NO match in df2, and only
# df1's columns.
same_location = df1['location_id'] == df2['location_id']
df1.join(df2, on=same_location, how='anti').display()

char_id,char_name,location_id
8,Margaery,l06


#### Full Join

In [0]:
# Full join: keeps unmatched rows from both sides, padding the missing side
# with nulls.
same_location = df1['location_id'] == df2['location_id']
df1.join(df2, on=same_location, how='full').display()

char_id,char_name,location_id,location_id.1,location
1.0,Jon,l01,l01,The Wall
2.0,Robb,l02,l02,Winterfell
3.0,Aarya,l02,l02,Winterfell
4.0,Cersie,l03,l03,Kings Landing
5.0,Jamie,l03,l03,Kings Landing
6.0,Tyrion,l05,l05,Dragonstone
7.0,Daenerys,l05,l05,Dragonstone
8.0,Margaery,l06,,
,,,l04,Beyond


### Window Functions

#### ROW_NUMBER

In [0]:
from pyspark.sql.window import Window

# Number the rows 1, 2, 3, ... in Item_Identifier order across the whole
# DataFrame (the window has no partitioning).
by_item_id = Window.orderBy('Item_Identifier')
numbered = df.withColumn('Row_Num', row_number().over(by_item_id))
numbered.limit(5).display()



Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Row_Num
DRA12,11.6,LF,0.0,Soft Drinks,141.9154,OUT035,2004,Small,Tier 2,Supermarket Type1,992.7078,1
DRA12,11.6,Low Fat,0.0,Soft Drinks,141.6154,OUT045,2002,,Tier 2,Supermarket Type1,3829.0158,2
DRA12,11.6,Low Fat,0.040911824,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,2552.6772,3
DRA12,11.6,Low Fat,0.041177505,Soft Drinks,140.3154,OUT017,2007,,Tier 2,Supermarket Type1,2552.6772,4
DRA12,11.6,Low Fat,0.041112694,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,850.8924,5



#### RANK

In [0]:
# rank(): rows sharing an Item_Identifier get the same rank, and the next
# distinct value skips ahead, leaving gaps (1, 1, ..., 7 in the output).
by_item_id = Window.orderBy('Item_Identifier')
ranked = df.withColumn('Rank', rank().over(by_item_id))
ranked.limit(10).display()



Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Rank
DRA12,11.6,Low Fat,0.041112694,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,850.8924,1
DRA12,11.6,Low Fat,0.0,Soft Drinks,141.6154,OUT045,2002,,Tier 2,Supermarket Type1,3829.0158,1
DRA12,11.6,Low Fat,0.040911824,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,2552.6772,1
DRA12,11.6,Low Fat,0.068535039,Soft Drinks,143.0154,OUT010,1998,,Tier 3,Grocery Store,283.6308,1
DRA12,11.6,Low Fat,0.041177505,Soft Drinks,140.3154,OUT017,2007,,Tier 2,Supermarket Type1,2552.6772,1
DRA12,11.6,LF,0.0,Soft Drinks,141.9154,OUT035,2004,Small,Tier 2,Supermarket Type1,992.7078,1
DRA24,19.35,Regular,0.066831682,Soft Drinks,163.8868,OUT010,1998,,Tier 3,Grocery Store,327.5736,7
DRA24,19.35,Regular,0.040154087,Soft Drinks,164.6868,OUT017,2007,,Tier 2,Supermarket Type1,1146.5076,7
DRA24,19.35,Regular,0.039920687,Soft Drinks,163.3868,OUT035,2004,Small,Tier 2,Supermarket Type1,3439.5228,7
DRA24,,Regular,0.069909188,Soft Drinks,163.2868,OUT019,1985,Small,Tier 1,Grocery Store,491.3604,7


#### DENSE_RANK

In [0]:
# dense_rank(): ties share a rank, and the next distinct Item_Identifier gets
# the very next integer, so there are no gaps (1, 1, ..., 2 in the output).
by_item_id = Window.orderBy('Item_Identifier')
dense_ranked = df.withColumn('Dense_Rank', dense_rank().over(by_item_id))
dense_ranked.limit(10).display()



Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Dense_Rank
DRA12,11.6,Low Fat,0.041112694,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,850.8924,1
DRA12,11.6,Low Fat,0.0,Soft Drinks,141.6154,OUT045,2002,,Tier 2,Supermarket Type1,3829.0158,1
DRA12,11.6,Low Fat,0.040911824,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,2552.6772,1
DRA12,11.6,Low Fat,0.068535039,Soft Drinks,143.0154,OUT010,1998,,Tier 3,Grocery Store,283.6308,1
DRA12,11.6,Low Fat,0.041177505,Soft Drinks,140.3154,OUT017,2007,,Tier 2,Supermarket Type1,2552.6772,1
DRA12,11.6,LF,0.0,Soft Drinks,141.9154,OUT035,2004,Small,Tier 2,Supermarket Type1,992.7078,1
DRA24,19.35,Regular,0.066831682,Soft Drinks,163.8868,OUT010,1998,,Tier 3,Grocery Store,327.5736,2
DRA24,19.35,Regular,0.040154087,Soft Drinks,164.6868,OUT017,2007,,Tier 2,Supermarket Type1,1146.5076,2
DRA24,19.35,Regular,0.039920687,Soft Drinks,163.3868,OUT035,2004,Small,Tier 2,Supermarket Type1,3439.5228,2
DRA24,,Regular,0.069909188,Soft Drinks,163.2868,OUT019,1985,Small,Tier 1,Grocery Store,491.3604,2


#### Cumulative_Sum

In [0]:
# No explicit frame is given, so every row that shares an Item_Type receives
# the same running value (all displayed 'Baking Goods' rows show one figure).
by_item_type = Window.orderBy('Item_Type')
df.withColumn('Cum_Sum', sum('Item_MRP').over(by_item_type)).limit(5).display()



Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Cum_Sum
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,81894.73640000001
FDW12,,Regular,0.035399923,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432,81894.73640000001
FDC37,,Low Fat,0.057556998,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876,81894.73640000001
FDL12,15.85,Regular,0.121632721,Baking Goods,60.622,OUT046,1997,Small,Tier 1,Supermarket Type1,2576.646,81894.73640000001
FDL12,15.85,Regular,0.121531501,Baking Goods,59.222,OUT013,1987,High,Tier 3,Supermarket Type1,599.22,81894.73640000001


In [0]:
# Row-based frame from the first row up to the current row: the sum grows
# row by row, giving a true running total.
running_frame = (
    Window.orderBy('Item_Type')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df.withColumn('Cum_Sum', sum('Item_MRP').over(running_frame)).limit(5).display()



Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Cum_Sum
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,51.4008
FDW12,,Regular,0.035399923,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432,195.9452
FDC37,,Low Fat,0.057556998,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876,303.639
FDL12,15.85,Regular,0.121632721,Baking Goods,60.622,OUT046,1997,Small,Tier 1,Supermarket Type1,2576.646,364.261
FDL12,15.85,Regular,0.121531501,Baking Goods,59.222,OUT013,1987,High,Tier 3,Supermarket Type1,599.22,423.483


In [0]:
# The frame spans from the first row to the last row, so every row carries
# the grand total of Item_MRP rather than a running value.
whole_frame = (
    Window.orderBy('Item_Type')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
df.withColumn('Cum_Sum', sum('Item_MRP').over(whole_frame)).limit(5).display()



Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Cum_Sum
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,1201681.480800002
FDW12,,Regular,0.035399923,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432,1201681.480800002
FDC37,,Low Fat,0.057556998,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876,1201681.480800002
FDL12,15.85,Regular,0.121632721,Baking Goods,60.622,OUT046,1997,Small,Tier 1,Supermarket Type1,2576.646,1201681.480800002
FDL12,15.85,Regular,0.121531501,Baking Goods,59.222,OUT013,1987,High,Tier 3,Supermarket Type1,599.22,1201681.480800002



### USER DEFINED FUNCTIONS

PS: Using UDFs is not encouraged: each executor, which normally runs on the JVM, has to start a Python interpreter to evaluate them, which can slow down execution a lot!

**Step 1:** Define the function in python

In [0]:
def my_func(x):
    """Return the square of ``x``.

    Args:
        x: A number, or ``None``.

    Returns:
        ``x * x``, or ``None`` when ``x`` is ``None`` so that a null input
        produces a null result instead of raising ``TypeError``.
    """
    # When used as a UDF, a null column value arrives here as None.
    if x is None:
        return None
    return x * x

**Step 2:** Convert python function to pyspark function

In [0]:
# Declare the return type explicitly: without it udf() defaults to StringType,
# which would make the squared value a string column instead of a numeric one.
my_udf = udf(my_func, DoubleType())


**Step 3:** Execute like a normal pyspark function

In [0]:
# Apply the UDF to Item_MRP as a new Squared_MRP column and preview five rows.
(
    df.withColumn('Squared_MRP', my_udf('Item_MRP'))
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Squared_MRP
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,62404.636404640005
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2329.91566864
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,20055.657924
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,33158.589025
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2901.0504099600003
