
## Advanced PySpark Transformations

Now we will take things up a notch by performing more complex transformations in PySpark.

Let's load our data and get started!

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Load the BigMart sales CSV: the first row supplies the column names and
# Spark is asked to infer each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/Volumes/workspace/default/tutorial_files/BigMartSales.csv')
)

### PIVOT

In [0]:
# One row per Item_Type, one column per distinct Outlet_Size value, each cell
# holding the average Item_MRP. Rows are ordered by Item_Type and only the
# first 10 are displayed.
(
    df.groupBy('Item_Type')
    .pivot('Outlet_Size')
    .agg(avg('Item_MRP'))
    .orderBy('Item_Type')
    .limit(10)
    .display()
)

Item_Type,null,High,Medium,Small
Baking Goods,126.66939891891889,129.20204383561642,126.1785684729064,125.21336363636368
Breads,139.04861666666667,133.75896,140.8610385542169,145.5236507042254
Breakfast,158.6750903225807,147.49058461538462,134.53751111111112,130.56802666666667
Canned,140.65181123595508,135.4427076923077,138.12485069124423,142.29542857142857
Dairy,149.0512677419355,153.50917249999995,148.51217431192666,145.94210101010103
Frozen Foods,137.49448464730293,136.82925,140.55701532846714,137.83854377510033
Fruits and Vegetables,142.57516045845267,145.57287042253515,142.9714702179177,148.31336951219507
Hard Drinks,134.3875333333333,141.9275217391304,142.83769599999994,129.758784
Health and Hygiene,130.55989019607844,135.11098032786884,128.70186470588237,131.83153529411757
Household,147.76930421455944,147.09752233009704,147.71133010380618,153.9654389105058


### When-Otherwise
#### Problem 1: Flag records as Veg or Non-Veg based on `Item_Type` using When-Otherwise

In [0]:
# Add Veg_Flag: 'Non-Veg' when Item_Type equals 'Meat', otherwise 'Veg'.
# Only the first 5 rows are displayed.
(
    df.withColumn(
        'Veg_Flag',
        when(col('Item_Type') == 'Meat', 'Non-Veg').otherwise('Veg'),
    )
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Veg_Flag
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Veg
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Veg
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Non-Veg
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,Veg
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Veg


#### Problem 2: Create a new column that categorizes each item as Veg with an MRP over 100 (Veg-Expensive), Veg with an MRP of 100 or less (Veg-Inexpensive), or Non-Veg

In [0]:
# Add Veg_Expensive, a three-way label:
#   'Non-Veg'          -> Item_Type is 'Meat'
#   'Veg-Expensive'    -> any other item with Item_MRP > 100
#   'Veg-Inexpensive'  -> any other item with Item_MRP <= 100
#
# 'Meat' is tested first so that 'Non-Veg' is assigned only on a positive
# match rather than acting as the catch-all. A comparison against null is
# neither true nor false in Spark, so with 'Non-Veg' as the fallback a
# non-meat row with a null Item_MRP (or a null Item_Type) would be labelled
# 'Non-Veg'. Here a null Item_Type is treated as Veg, matching the Veg_Flag
# cell, and a non-meat row with a null Item_MRP matches no branch and is
# left null (no otherwise() is supplied on purpose).
(
    df.withColumn(
        'Veg_Expensive',
        when(col('Item_Type') == 'Meat', 'Non-Veg')
        .when(col('Item_MRP') > 100, 'Veg-Expensive')
        .when(col('Item_MRP') <= 100, 'Veg-Inexpensive'),
    )
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Veg_Expensive
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Veg-Expensive
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Veg-Inexpensive
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Non-Veg
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,Veg-Expensive
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Veg-Inexpensive


### JOINs

In [0]:

# Characters table rows: (char_id, char_name, location_id).
# Location 'l06' does not appear in the locations data defined below, so this
# row has no match when the two frames are joined.
dataj1 = [
    ('1', 'Jon', 'l01'),
    ('2', 'Robb', 'l02'),
    ('3', 'Aarya', 'l02'),
    ('4', 'Cersie', 'l03'),
    ('5', 'Jamie', 'l03'),
    ('6', 'Tyrion', 'l05'),
    ('7', 'Daenerys', 'l05'),
    ('8', 'Margaery', 'l06'),
]

# DDL-style schema string: all three columns are strings.
schemaj1 = 'char_id STRING, char_name STRING, location_id STRING'

df1 = spark.createDataFrame(data=dataj1, schema=schemaj1)

# Locations table rows: (location_id, location).
# Location 'l04' is not referenced by any row of the characters data above.
dataj2 = [
    ('l01', 'The Wall'),
    ('l02', 'Winterfell'),
    ('l03', 'Kings Landing'),
    ('l04', 'Beyond'),
    ('l05', 'Dragonstone'),
]

# DDL-style schema string: both columns are strings.
schemaj2 = 'location_id STRING, location STRING'

df2 = spark.createDataFrame(data=dataj2, schema=schemaj2)

In [0]:
# Show both join inputs: the characters frame first, then the locations frame.
for frame in (df1, df2):
    frame.display()

char_id,char_name,location_id
1,Jon,l01
2,Rob,l02
3,Aarya,l02
4,Cersie,l03
5,Jamie,l03
6,Tyrion,l05
7,Daenerys,l05
8,Margaery,l06


location_id,location
l01,The Wall
l02,Winterfell
l03,Kings Landing
l04,Beyond
l05,Dragonstone


#### Inner Join

In [0]:
# Inner join: only rows whose location_id appears in both frames.
# Joining on a column expression keeps the location_id column of each frame.
df1.join(
    other=df2,
    on=df1['location_id'] == df2['location_id'],
    how='inner',
).display()

char_id,char_name,location_id,location_id.1,location
1,Jon,l01,l01,The Wall
2,Rob,l02,l02,Winterfell
3,Aarya,l02,l02,Winterfell
4,Cersie,l03,l03,Kings Landing
5,Jamie,l03,l03,Kings Landing
6,Tyrion,l05,l05,Dragonstone
7,Daenerys,l05,l05,Dragonstone


#### Left Join

In [0]:
# Left join: every row of df1; df2's columns are null where no location_id
# matches.
df1.join(
    other=df2,
    on=df1['location_id'] == df2['location_id'],
    how='left',
).display()

char_id,char_name,location_id,location_id.1,location
1,Jon,l01,l01,The Wall
2,Rob,l02,l02,Winterfell
3,Aarya,l02,l02,Winterfell
4,Cersie,l03,l03,Kings Landing
5,Jamie,l03,l03,Kings Landing
6,Tyrion,l05,l05,Dragonstone
7,Daenerys,l05,l05,Dragonstone
8,Margaery,l06,,


#### Right Join

In [0]:
# Right join: every row of df2; df1's columns are null where no location_id
# matches.
df1.join(
    other=df2,
    on=df1['location_id'] == df2['location_id'],
    how='right',
).display()

char_id,char_name,location_id,location_id.1,location
1.0,Jon,l01,l01,The Wall
3.0,Aarya,l02,l02,Winterfell
2.0,Rob,l02,l02,Winterfell
5.0,Jamie,l03,l03,Kings Landing
4.0,Cersie,l03,l03,Kings Landing
,,,l04,Beyond
7.0,Daenerys,l05,l05,Dragonstone
6.0,Tyrion,l05,l05,Dragonstone


#### Anti Join

In [0]:
# Anti join: rows of df1 that have NO matching location_id in df2.
# Only df1's columns are returned.
df1.join(
    other=df2,
    on=df1['location_id'] == df2['location_id'],
    how='anti',
).display()

char_id,char_name,location_id
8,Margaery,l06


#### Full Join

In [0]:
# Full (outer) join: every row from both frames; the columns of whichever
# side has no matching location_id are null.
df1.join(
    other=df2,
    on=df1['location_id'] == df2['location_id'],
    how='full',
).display()

char_id,char_name,location_id,location_id.1,location
1.0,Jon,l01,l01,The Wall
2.0,Rob,l02,l02,Winterfell
3.0,Aarya,l02,l02,Winterfell
4.0,Cersie,l03,l03,Kings Landing
5.0,Jamie,l03,l03,Kings Landing
6.0,Tyrion,l05,l05,Dragonstone
7.0,Daenerys,l05,l05,Dragonstone
8.0,Margaery,l06,,
,,,l04,Beyond
