
## Intermediate PySpark Transformations

Now we will take this to the next level with some more complex transformations in PySpark.

Let's load our data and get started!

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Load the BigMart sales CSV. `header=True` takes the column names from the
# first line of the file, and `inferSchema=True` asks Spark to detect each
# column's type from the data instead of reading every column as a string.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/Volumes/workspace/default/tutorial_files/BigMartSales.csv')
)

In [0]:
# Peek at the first five rows only, rather than rendering the whole dataset.
preview = df.limit(5)
preview.display()

# Last expression of the cell: the total number of rows becomes the cell output.
df.count()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


8523

### dropDuplicates() or drop_duplicates()

#### Problem 1: Drop duplicates from df
This will not change anything, since `df` contains no duplicate rows: the count stays at 8523.

In [0]:
# Called without arguments, `dropDuplicates` compares every column, so only
# rows that are identical across the board are removed.
deduplicated = df.dropDuplicates()
deduplicated.count()

8523

In [0]:
# `drop_duplicates` is the snake_case alias of `dropDuplicates`; the row count
# it produces is the same.
deduplicated_alias = df.drop_duplicates()
deduplicated_alias.count()

8523

#### Problem 2: Drop duplicates based on a subset of columns: `Item_Type`

In [0]:
# Rows are compared on `Item_Type` only, leaving one row per distinct value.
# NOTE(review): which of the duplicate rows survives is presumably not
# guaranteed by Spark; do not rely on a specific row being kept.
dedup_columns = ['Item_Type']
one_row_per_item_type = df.drop_duplicates(subset=dedup_columns)
one_row_per_item_type.display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
FDO23,17.85,Low Fat,0.0,Breads,93.1436,OUT045,2002,,Tier 2,Supermarket Type1,2174.5028
FDP49,9.0,Regular,0.069088961,Breakfast,56.3614,OUT046,1997,Small,Tier 1,Supermarket Type1,1547.3192
FDC14,,Regular,0.072221801,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
FDH17,16.2,Regular,0.016687114,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
DRI11,,Low Fat,0.034237682,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
NCB42,11.8,Low Fat,0.008596051,Health and Hygiene,115.3492,OUT018,2009,Medium,Tier 3,Supermarket Type2,1621.8888
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


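**Note:** `.distinct()` returns the same result as `drop_duplicates()` called without a `subset`. Unlike `drop_duplicates()`, it takes no arguments, so it cannot be restricted to specific columns.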


### UNION and UNION BY NAME

In [0]:
# Two small DataFrames that share the same schema: identical column names in
# the identical order (`id`, then `name`).
schema1 = 'id STRING, name STRING'
data1 = [
    ('1', 'Jon'),
    ('2', 'Aarya'),
]
df1 = spark.createDataFrame(data=data1, schema=schema1)

schema2 = 'id STRING, name STRING'
data2 = [
    ('3', 'Tyrion'),
    ('4', 'Daenerys'),
]
df2 = spark.createDataFrame(data=data2, schema=schema2)

In [0]:
# Render both input DataFrames, df1 first, before combining them.
for frame in (df1, df2):
    frame.display()

id,name
1,Jon
2,Aarya


id,name
3,Tyrion
4,Daenerys



#### UNION

In [0]:
# `union` stacks df2's rows underneath df1's rows; both frames list their
# columns in the same order here, so the values line up as expected.
stacked = df1.union(df2)
stacked.display()

id,name
1,Jon
2,Aarya
3,Tyrion
4,Daenerys


In [0]:
# Rebuild df1 with its columns swapped (`name` first, then `id`). df2 is not
# touched and still has the order `id`, `name`.
schema1 = 'name STRING, id STRING'
data1 = [
    ('Jon', '1'),
    ('Aarya', '2'),
]
df1 = spark.createDataFrame(data=data1, schema=schema1)
df1.display()

name,id
Jon,1
Aarya,2


In [0]:
 # `union` pairs columns by position, not by name: df1 is (`name`, `id`) while
 # df2 is (`id`, `name`), so df2's ids land under `name` and its names under `id`.
 mismatched = df1.union(df2)
 mismatched.display()

name,id
Jon,1
Aarya,2
3,Tyrion
4,Daenerys



#### UNION BY NAME

In [0]:
# `unionByName` pairs columns by their names instead of their positions, so
# df2's values land under the correct `name` / `id` headers despite the
# different column order.
aligned = df1.unionByName(df2)
aligned.display()

name,id
Jon,1
Aarya,2
Tyrion,3
Daenerys,4
