
## Intermediate PySpark Transformations

Now we will take this to the next level with somewhat more complex transformations in PySpark.

Let's load our data and get started!

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Load the BigMart sales CSV: the first row supplies the column names and
# column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/Volumes/workspace/default/tutorial_files/BigMartSales.csv')
)

In [0]:
# Preview the first five rows, then report the total number of rows.
df.limit(5).display()
df.count()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


8523

### dropDuplicates() or drop_duplicates()

#### Problem 1: Drop duplicates from df
This will not do much since there are no duplicates in df

In [0]:
# No subset given: rows are compared across every column before de-duplicating.
df.dropDuplicates().count()

8523

In [0]:
# drop_duplicates() is an alias of dropDuplicates(); the count is the same.
df.drop_duplicates().count()

8523

#### Problem 2: Drop duplicates based on the subset of column `Item_Type`

In [0]:
# Keep a single row per distinct Item_Type value.
# NOTE: Spark does not guarantee which row of each group survives.
df.drop_duplicates(subset=['Item_Type']).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
FDO23,17.85,Low Fat,0.0,Breads,93.1436,OUT045,2002,,Tier 2,Supermarket Type1,2174.5028
FDP49,9.0,Regular,0.069088961,Breakfast,56.3614,OUT046,1997,Small,Tier 1,Supermarket Type1,1547.3192
FDC14,,Regular,0.072221801,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
FDH17,16.2,Regular,0.016687114,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
DRI11,,Low Fat,0.034237682,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
NCB42,11.8,Low Fat,0.008596051,Health and Hygiene,115.3492,OUT018,2009,Medium,Tier 3,Supermarket Type2,1621.8888
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


**Note:** `.distinct()` behaves like `drop_duplicates()` called without a `subset`: it always compares entire rows.


### UNION and UNION BYNAME

In [0]:
# Two small frames that share the same column order (id, name).
schema1 = 'id STRING, name STRING'
data1 = [
    ('1', 'Jon'),
    ('2', 'Aarya'),
]
df1 = spark.createDataFrame(data=data1, schema=schema1)

schema2 = 'id STRING, name STRING'
data2 = [
    ('3', 'Tyrion'),
    ('4', 'Daenerys'),
]
df2 = spark.createDataFrame(data=data2, schema=schema2)

In [0]:
# Show both input frames before combining them.
df1.display()
df2.display()

id,name
1,Jon
2,Aarya


id,name
3,Tyrion
4,Daenerys



#### UNION

In [0]:
# union() appends df2's rows to df1, matching columns by position.
df1.union(df2).display()

id,name
1,Jon
2,Aarya
3,Tyrion
4,Daenerys


In [0]:
# Rebuild df1 with its columns swapped to (name, id); df2 still uses (id, name).
schema1 = 'name STRING, id STRING'
data1 = [
    ('Jon', '1'),
    ('Aarya', '2'),
]
df1 = spark.createDataFrame(data=data1, schema=schema1)
df1.display()

name,id
Jon,1
Aarya,2


In [0]:
 df1.union(df2).display()  # positional match: df2's id values end up under df1's `name` column

name,id
Jon,1
Aarya,2
3,Tyrion
4,Daenerys



#### UNION BY NAME

In [0]:
# unionByName() aligns columns by name, so the differing column order no longer matters.
df1.unionByName(df2).display()

name,id
Jon,1
Aarya,2
Tyrion,3
Daenerys,4


### String Functions

#### initcap()

In [0]:
# initcap() upper-cases the first letter of every word.
(
    df.select(initcap(col('Item_Type')).alias('Item_Type_Initcap'))
    .limit(5)
    .display()
)

Item_Type_Initcap
Dairy
Soft Drinks
Meat
Fruits And Vegetables
Household


#### lower()

In [0]:
# lower() converts the whole string to lower case.
(
    df.select(lower(col('Item_Type')).alias('Item_Type_Lower'))
    .limit(5)
    .display()
)

Item_Type_Lower
dairy
soft drinks
meat
fruits and vegetables
household


#### upper()

In [0]:
# upper() converts the whole string to upper case.
(
    df.select(upper(col('Item_Type')).alias('Item_Type_Upper'))
    .limit(5)
    .display()
)

Item_Type_Upper
DAIRY
SOFT DRINKS
MEAT
FRUITS AND VEGETABLES
HOUSEHOLD


### Date Functions

#### current_date()

In [0]:
# Add today's date to every row; the later date examples build on this column.
today = current_date()
df = df.withColumn('curr_date', today)

df.limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,2025-11-26
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26


#### date_add()

In [0]:
# week_after = curr_date shifted forward by seven days.
df = df.withColumn(
    'week_after',
    date_add(col('curr_date'), 7),
)

df.limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,2025-11-26,2025-12-03
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03


#### date_sub()

In [0]:
# week_before = curr_date shifted back by seven days.
df = df.withColumn(
    'week_before',
    date_sub(col('curr_date'), 7),
)

df.limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03,2025-11-19
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03,2025-11-19
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03,2025-11-19
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,2025-11-26,2025-12-03,2025-11-19
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03,2025-11-19


Similar results can also be achieved by using `df = df.withColumn('week_before', date_add('curr_date', -7))` which is a standard practice and is encouraged.

#### date_diff()

In [0]:
# date_diff(end, start) returns the number of days from start to end.
df = df.withColumn(
    'date_delta',
    date_diff(col('week_after'), col('curr_date')),
)

df.limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03,2025-11-19,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03,2025-11-19,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03,2025-11-19,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,2025-11-26,2025-12-03,2025-11-19,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03,2025-11-19,7


#### date_format()

In [0]:
# Render the "week before" date as a dd-MM-yyyy string.
# The value is derived from `curr_date` rather than from `week_before` itself:
# once `week_before` holds a dd-MM-yyyy string it is no longer a date Spark can
# format, so formatting the column in place would break if this cell were re-run.
# Deriving it from `curr_date` gives the same result on every run.
df = df.withColumn(
    'week_before',
    date_format(date_sub('curr_date', 7), 'dd-MM-yyyy'),
)
df.limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03,19-11-2025,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03,19-11-2025,7


**Spark's datetime patterns are case-sensitive** (since Spark 3.0 they are built on Java's `DateTimeFormatter`; Spark 2.x used `SimpleDateFormat`)

Different letters mean completely different things depending on whether they’re uppercase or lowercase.

So:

- `M (uppercase) → Month`
- `m (lowercase) → Minute`
- `D (uppercase) → Day of Year`
- `d (lowercase) → Day of Month`
- `Y (uppercase) → Week-based-year` (week-based patterns such as `Y` are rejected by Spark 3.0+ under the default time parser policy)
- `y (lowercase) → Regular calendar year`

This is why the case matters.

### Handling Null Values

#### Dropping Null values

`dropna() with 'all'`

This will drop only those records which have null values in all of its columns

It will not affect current df since it has no such record

In [0]:
# how='all': a row is removed only when every one of its columns is NULL.
df.dropna(how='all').limit(5).display()
df.dropna(how='all').count()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03,19-11-2025,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03,19-11-2025,7


8523

`dropna() with 'any'`

This will drop all the records which have `NULL` in at least one of the columns

In [0]:
# how='any': a row is removed as soon as one of its columns is NULL.
df.dropna(how='any').limit(5).display()
df.dropna(how='any').count()


Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03,19-11-2025,7
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,2025-11-26,2025-12-03,19-11-2025,7


4650

`dropna() with subset ['column_name1', ...]`

This will drop only those records where we find `NULL` value in the subset of columns

In [0]:
# Only NULLs in the listed columns cause a row to be removed.
null_check_cols = ['Outlet_Size']
df.dropna(subset=null_check_cols).limit(5).display()
df.dropna(subset=null_check_cols).count()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03,19-11-2025,7
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,2025-11-26,2025-12-03,19-11-2025,7


6113

The `how` and `subset` arguments can also be combined, as follows:
- `df.dropna(how='all', subset=['Outlet_Size', 'Item_Weight'])`
- `df.dropna(how='any', subset=['Outlet_Size', 'Item_Weight'])`

#### Filling NULL Values

This is achieved by using `fillna()`, which takes the replacement value and, like `dropna()`, an optional `subset` of columns. Here we will apply it to a subset only.

In [0]:
# Replace NULLs in Outlet_Size with a placeholder; other columns are left untouched.
(
    df.fillna(value='NotAvailable', subset=['Outlet_Size'])
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2025-11-26,2025-12-03,19-11-2025,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,NotAvailable,Tier 3,Grocery Store,732.38,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,2025-11-26,2025-12-03,19-11-2025,7


### SPLIT and Indexing

#### SPLIT

In [0]:
# split() turns the space-separated string into an array of words.
(
    df.withColumn('Outlet_Type', split(col('Outlet_Type'), ' '))
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,"List(Supermarket, Type2)",443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",2097.27,2025-11-26,2025-12-03,19-11-2025,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,"List(Grocery, Store)",732.38,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,"List(Supermarket, Type1)",994.7052,2025-11-26,2025-12-03,19-11-2025,7


#### Indexing

In [0]:
# First word goes to a new `Outlet` column; `Outlet_Type` is then replaced by the second word.
# `Outlet` must be derived first, while `Outlet_Type` still holds the full string.
(
    df.withColumn('Outlet', split(col('Outlet_Type'), ' ').getItem(0))
    .withColumn('Outlet_Type', split(col('Outlet_Type'), ' ').getItem(1))
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta,Outlet
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Type1,3735.138,2025-11-26,2025-12-03,19-11-2025,7,Supermarket
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Type2,443.4228,2025-11-26,2025-12-03,19-11-2025,7,Supermarket
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Type1,2097.27,2025-11-26,2025-12-03,19-11-2025,7,Supermarket
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Store,732.38,2025-11-26,2025-12-03,19-11-2025,7,Grocery
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Type1,994.7052,2025-11-26,2025-12-03,19-11-2025,7,Supermarket


### Explode

In [0]:
# new_df keeps Outlet_Type as an array of words; the explode and array_contains examples use it.
new_df = df.withColumn(
    'Outlet_Type',
    split(col('Outlet_Type'), ' '),
)

new_df.limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,"List(Supermarket, Type2)",443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",2097.27,2025-11-26,2025-12-03,19-11-2025,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,"List(Grocery, Store)",732.38,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,"List(Supermarket, Type1)",994.7052,2025-11-26,2025-12-03,19-11-2025,7


In [0]:
# explode() emits one output row per array element, repeating the other columns.
(
    new_df.withColumn('Outlet_Type', explode(col('Outlet_Type')))
    .limit(10)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket,3735.138,2025-11-26,2025-12-03,19-11-2025,7
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Type1,3735.138,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket,443.4228,2025-11-26,2025-12-03,19-11-2025,7
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Type2,443.4228,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket,2097.27,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Type1,2097.27,2025-11-26,2025-12-03,19-11-2025,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery,732.38,2025-11-26,2025-12-03,19-11-2025,7
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Store,732.38,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket,994.7052,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Type1,994.7052,2025-11-26,2025-12-03,19-11-2025,7



### array_contains()

#### Problem 1: Create a column `Type1_Flag` which is `True` if the `Outlet_Type` array contains `Type1`, else `False`

In [0]:
# Flag each row according to whether its Outlet_Type array includes 'Type1'.
(
    new_df.withColumn('Type1_Flag', array_contains(col('Outlet_Type'), 'Type1'))
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta,Type1_Flag
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",3735.138,2025-11-26,2025-12-03,19-11-2025,7,True
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,"List(Supermarket, Type2)",443.4228,2025-11-26,2025-12-03,19-11-2025,7,False
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",2097.27,2025-11-26,2025-12-03,19-11-2025,7,True
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,"List(Grocery, Store)",732.38,2025-11-26,2025-12-03,19-11-2025,7,False
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,"List(Supermarket, Type1)",994.7052,2025-11-26,2025-12-03,19-11-2025,7,True


#### Problem 2: Write a query to filter dataframe based on if `Outlet_Type` column has `Type1`

In [0]:
# Keep only the rows whose Outlet_Type array includes 'Type1'.
(
    new_df.filter(array_contains(col('Outlet_Type'), 'Type1'))
    .limit(5)
    .display()
)

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,curr_date,week_after,week_before,date_delta
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",3735.138,2025-11-26,2025-12-03,19-11-2025,7
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,"List(Supermarket, Type1)",2097.27,2025-11-26,2025-12-03,19-11-2025,7
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,"List(Supermarket, Type1)",994.7052,2025-11-26,2025-12-03,19-11-2025,7
FDO10,13.65,Regular,0.012741089,Snack Foods,57.6588,OUT013,1987,High,Tier 3,"List(Supermarket, Type1)",343.5528,2025-11-26,2025-12-03,19-11-2025,7
FDH17,16.2,Regular,0.016687114,Frozen Foods,96.9726,OUT045,2002,,Tier 2,"List(Supermarket, Type1)",1076.5986,2025-11-26,2025-12-03,19-11-2025,7



### GroupBy

#### Problem 1: Find sum of all `Item_MRP` for items belonging to a particular `Item_Type`

In [0]:
# Total MRP per item type.
# `sum` here is the Spark aggregate brought in by the wildcard
# `pyspark.sql.functions` import, not Python's built-in sum.
(
    df.groupBy('Item_Type')
    .agg(sum(col('Item_MRP')).alias('sum_MRP'))
    .limit(10)
    .display()
)

Item_Type,sum_MRP
Canned,90706.7269999999
Meat,59449.86379999996
Starchy Foods,21880.027399999995
Baking Goods,81894.73640000001
Snack Foods,175433.92040000003
Breakfast,15596.6966
Dairy,101276.45959999996
Household,135976.52539999998
Seafood,9077.870000000004
Frozen Foods,118558.8814


#### Problem 2: Do the same but find average MRP and sort by descending value of average MRP

In [0]:
# Average MRP per item type, highest average first.
(
    df.groupBy('Item_Type')
    .agg(avg(col('Item_MRP')).alias('avg_MRP'))
    .orderBy(desc('avg_MRP'))
    .limit(10)
    .display()
)

Item_Type,avg_MRP
Household,149.42475318681318
Dairy,148.49920762463336
Starchy Foods,147.83802297297294
Snack Foods,146.19493366666669
Fruits and Vegetables,144.58123457792206
Seafood,141.84171875000004
Breakfast,141.78815090909092
Breads,140.9526685258964
Meat,139.88203247058814
Canned,139.76383204930647



#### Problem 3: Find average `Item_MRP` based on `Item_Type` and `Outlet_Size`

In [0]:
# Average MRP for every (Item_Type, Outlet_Size) pair, sorted by both keys.
group_keys = ['Item_Type', 'Outlet_Size']
(
    df.groupBy(*group_keys)
    .agg(avg(col('Item_MRP')).alias('avg_MRP'))
    .sort(*group_keys)
    .limit(10)
    .display()
)

Item_Type,Outlet_Size,avg_MRP
Baking Goods,,126.66939891891889
Baking Goods,High,129.20204383561642
Baking Goods,Medium,126.1785684729064
Baking Goods,Small,125.21336363636368
Breads,,139.04861666666667
Breads,High,133.75896
Breads,Medium,140.8610385542169
Breads,Small,145.5236507042254
Breakfast,,158.6750903225807
Breakfast,High,147.49058461538462


#### Problem 4: Do the same as 3 but now also get total MRP

In [0]:
# Average and total MRP for every (Item_Type, Outlet_Size) pair, sorted by both keys.
# `sum` is the Spark aggregate from the wildcard import, not Python's built-in.
(
    df.groupBy('Item_Type', 'Outlet_Size')
    .agg(
        avg(col('Item_MRP')).alias('Avg_MRP'),
        sum(col('Item_MRP')).alias('Total_MRP'),
    )
    .sort('Item_Type', 'Outlet_Size')
    .limit(10)
    .display()
)

Item_Type,Outlet_Size,Avg_MRP,Total_MRP
Baking Goods,,126.66939891891889,23433.838799999998
Baking Goods,High,129.20204383561642,9431.749199999998
Baking Goods,Medium,126.1785684729064,25614.2494
Baking Goods,Small,125.21336363636368,23414.89900000001
Breads,,139.04861666666667,10011.5004
Breads,High,133.75896,3343.974
Breads,Medium,140.8610385542169,11691.4662
Breads,Small,145.5236507042254,10332.179200000002
Breakfast,,158.6750903225807,4918.927800000001
Breakfast,High,147.49058461538462,1917.3776


### collect_list() aggregate function

In [0]:
# Small (user, book) sample with one row per book a user has.
schema = 'user string, book string'
data = [
    ('user1', 'book1'),
    ('user1', 'book2'),
    ('user2', 'book2'),
    ('user2', 'book4'),
    ('user3', 'book1'),
]

df_book = spark.createDataFrame(data=data, schema=schema)
df_book.display()

user,book
user1,book1
user1,book2
user2,book2
user2,book4
user3,book1


In [0]:
# Gather each user's books into a single array column.
# NOTE: collect_list() keeps duplicates and does not guarantee element order.
(
    df_book.groupBy('user')
    .agg(collect_list(col('book')).alias('books'))
    .display()
)

user,books
user1,"List(book1, book2)"
user2,"List(book2, book4)"
user3,List(book1)
