In [1]:
import spark_env

# Build the session object through the project-local `spark_env` helper (its source is not shown here).
# NOTE(review): 'pyspark' is presumably the application name -- confirm against spark_env.
spark = spark_env.create_spark_session('pyspark')

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Reading a CSV file

In [4]:
# Load the Bigmart sales CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('Bigmart Sales.csv')
)

In [5]:
# Preview the first five rows of the CSV-backed DataFrame.
df.show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

## Reading a JSON File

In [7]:
# Load the drivers file as line-delimited JSON (multiLine=False: one JSON
# object per line). The JSON reader infers the schema on its own when none is
# supplied; the CSV-only 'header' and 'inferSchema' options have no effect on
# JSON sources, so they are intentionally not set here.
df_json = (
    spark.read
    .format('json')
    .option('multiLine', False)
    .load('drivers.json')
)

In [8]:
# Preview the first five driver records.
df_json.show(n=5)

+----+----------+--------+----------+--------------------+-----------+------+--------------------+
|code|       dob|driverId| driverRef|                name|nationality|number|                 url|
+----+----------+--------+----------+--------------------+-----------+------+--------------------+
| HAM|1985-01-07|       1|  hamilton|   {Lewis, Hamilton}|    British|    44|http://en.wikiped...|
| HEI|1977-05-10|       2|  heidfeld|    {Nick, Heidfeld}|     German|    \N|http://en.wikiped...|
| ROS|1985-06-27|       3|   rosberg|     {Nico, Rosberg}|     German|     6|http://en.wikiped...|
| ALO|1981-07-29|       4|    alonso|  {Fernando, Alonso}|    Spanish|    14|http://en.wikiped...|
| KOV|1981-10-19|       5|kovalainen|{Heikki, Kovalainen}|    Finnish|    \N|http://en.wikiped...|
+----+----------+--------+----------+--------------------+-----------+------+--------------------+
only showing top 5 rows



## Schema Definition

In [9]:
# Print the column names, types and nullability of the CSV-backed DataFrame.
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [10]:
# Explicit schema for the Bigmart CSV: every column is declared as a nullable
# string, in the same order as the columns appear in the file.
custom_Schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'Item_Identifier',
        'Item_Weight',
        'Item_Fat_Content',
        'Item_Visibility',
        'Item_Type',
        'Item_MRP',
        'Outlet_Identifier',
        'Outlet_Establishment_Year',
        'Outlet_Size',
        'Outlet_Location_Type',
        'Outlet_Type',
        'Item_Outlet_Sales',
    )
])

In [12]:
# Re-read the same CSV, this time applying the hand-written schema instead of
# letting Spark infer the column types.
df_schema = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(custom_Schema)
    .load('Bigmart Sales.csv')
)

In [13]:
# Print the schema of the DataFrame that was read with custom_Schema applied.
df_schema.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: string (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: string (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)



## Select Function

In [14]:
# Project three columns by name and preview five rows.
df.select('Item_Identifier', 'Item_Weight', 'Item_Fat_Content').show(n=5)

+---------------+-----------+----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|
+---------------+-----------+----------------+
|          FDA15|        9.3|         Low Fat|
|          DRC01|       5.92|         Regular|
|          FDN15|       17.5|         Low Fat|
|          FDX07|       19.2|         Regular|
|          NCD19|       8.93|         Low Fat|
+---------------+-----------+----------------+
only showing top 5 rows



## Alias Function

In [16]:
# Same projection, but expose Item_Identifier under the shorter name Item_ID.
df.select(
    col('Item_Identifier').alias('Item_ID'),
    'Item_Weight',
    'Item_Fat_Content',
).show(n=5)

+-------+-----------+----------------+
|Item_ID|Item_Weight|Item_Fat_Content|
+-------+-----------+----------------+
|  FDA15|        9.3|         Low Fat|
|  DRC01|       5.92|         Regular|
|  FDN15|       17.5|         Low Fat|
|  FDX07|       19.2|         Regular|
|  NCD19|       8.93|         Low Fat|
+-------+-----------+----------------+
only showing top 5 rows



## Filter/Where Function

1. Filter the data where Item_Fat_Content = 'Regular'
2. Slice the data where Item_Type = 'Soft Drinks' and Item_Weight < 10
3. Fetch the data where Outlet_Location_Type is 'Tier 1' or 'Tier 2' and Outlet_Size is null

In [29]:
# 1. Keep only the rows whose Item_Fat_Content equals 'Regular'
df.where(df['Item_Fat_Content'] == 'Regular').show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          FDX07|       19.2|         Regular|            0.0|Fruits and Vegeta...| 182.095|           OUT010|                     1998|       NULL|              Tier 3|    Gro

In [36]:
# 2. Keep soft drinks that weigh less than 10 (both conditions must hold)
(
    df.where(col('Item_Type') == 'Soft Drinks')
      .where(col('Item_Weight') < 10)
      .show(n=5)
)

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          DRC01|       5.92|         Regular|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          DRZ11|       8.85|         Regular|    0.113123893|Soft Drinks|122.5388|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|        1609.9044|
|          DRF4

In [41]:
# 3. Rows located in Tier 1 or Tier 2 whose Outlet_Size is missing
df.where(
    col('Outlet_Size').isNull()
    & col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')
).show(n=5)

+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|         Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDH17|       16.2|         Regular|    0.016687114|      Frozen Foods| 96.9726|           OUT045|                     2002|       NULL|              Tier 2|Supermarket Type1|        1076.5986|
|          FDU28|       19.2|         Regular|     0.09444959|      Frozen Foods|187.8214|           OUT017|                     2007|       NULL|              Tier 2|Supermarket Type1

## withColumnRenamed

In [45]:
# Show the frame with Item_Weight renamed to Item_Wt; the result is not
# assigned, so `df` itself keeps the original column name.
df.withColumnRenamed(existing='Item_Weight', new='Item_Wt').show(n=5)

+---------------+-------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Wt|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|    9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|   5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         

## withColumn

1. Create a new column with a constant value 'new'
2. Multiply the Item_Weight and Item_MRP columns and store the result (rounded to 2 decimal places) in a new column named 'Cost'
3. Modify the column Item_Fat_Content by replacing 'Low Fat' with 'LF' and 'Regular' with 'R'

In [46]:
# 1. Append a column named 'flag' holding the constant string 'new' on every
#    row (display only; `df` is not reassigned).
df.withColumn(colName='flag', col=lit('new')).show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138| new|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|        

In [48]:
# 2. Cost = Item_Weight * Item_MRP, rounded to two decimals.
#    `round` here is the Spark column function brought in by the wildcard
#    import of pyspark.sql.functions, not Python's builtin.
#    `df` is reassigned, so every later cell sees the extra 'Cost' column.
df = df.withColumn(
    'Cost',
    round(col('Item_Weight') * col('Item_MRP'), 2),
)
df.show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2323.23|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Med

In [51]:
# 3. Recode Item_Fat_Content: 'Regular' -> 'R' and 'Low Fat' -> 'LF'.
#    regexp_replace treats its pattern as a regular expression and replaces
#    every match anywhere in the string, so the patterns are anchored with
#    ^...$ to recode only values that are exactly 'Regular' / 'Low Fat' and
#    leave any other value untouched.
#    `df` is reassigned, so later cells see the recoded values.
df = (
    df.withColumn('Item_Fat_Content',
                  regexp_replace(col('Item_Fat_Content'), '^Regular$', 'R'))
      .withColumn('Item_Fat_Content',
                  regexp_replace(col('Item_Fat_Content'), '^Low Fat$', 'LF'))
)
df.show(5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          FDA15|        9.3|              LF|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2323.23|
|          DRC01|       5.92|               R|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Med

## Type Casting

In [53]:
# Cast Item_Weight to double and print the resulting schema (display only;
# `df` is not reassigned).
df.withColumn('Item_Weight', col('Item_Weight').cast('double')).printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)
 |-- Cost: double (nullable = true)



## Sort/Order By -> Wide Transformation

1. Sort based on Item_Weight in Descending Order
2. Sort based on Item_Visibility in Ascending Order
3. Sort based on Item_Weight in Descending Order, then Item_Visibility in Ascending Order

In [55]:
# 1. Heaviest items first.
df.orderBy(desc('Item_Weight')).show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          FDC02|      21.35|              LF|    0.069102831|              Canned|259.9278|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|        6768.5228|5549.46|
|          FDC02|      21.35|              LF|    0.115194717|              Canned|258.3278|           OUT010|                     1998|       N

In [56]:
# 2. Least visible items first.
df.orderBy(asc('Item_Visibility')).show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          FDO23|      17.85|              LF|            0.0|              Breads| 93.1436|           OUT045|                     2002|       NULL|              Tier 2|Supermarket Type1|        2174.5028|1662.61|
|          FDY07|       11.8|              LF|            0.0|Fruits and Vegeta...| 45.5402|           OUT049|                     1999|     Med

In [58]:
# 3. Two-key sort: Item_Weight descending, ties broken by Item_Visibility
#    ascending.
df.orderBy(
    col('Item_Weight').desc(),
    col('Item_Visibility').asc(),
).show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          FDC02|      21.35|              LF|    0.068765205|              Canned|260.4278|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|        3644.5892|5560.13|
|          FDC02|      21.35|              LF|    0.068809463|              Canned|258.5278|           OUT035|                     2004|      Sm

## Limit

In [59]:
# Restrict the DataFrame to two rows, then display them.
df.limit(num=2).show()

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          FDA15|        9.3|              LF|    0.016047301|      Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2323.23|
|          DRC01|       5.92|               R|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|  

## Drop

In [61]:
# Show the frame without the 'Cost' column; drop returns a new DataFrame and
# the result is not assigned, so `df` still contains 'Cost' afterwards.
df.drop('Cost').show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|              LF|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|               R|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

In [62]:
# Confirm that `df` itself is unchanged by the previous drop: 'Cost' is still present.
df.show(n=5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          FDA15|        9.3|              LF|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2323.23|
|          DRC01|       5.92|               R|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Med

## Drop Duplicates

In [78]:
# Remove rows that are identical across all columns, then preview five rows.
df.drop_duplicates().show(n=5)

+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|   Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|   Cost|
+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          NCQ42|      20.35|              LF|    0.039428167|   Household|127.9678|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|        3560.6984|2604.14|
|          DRG39|      14.15|              LF|    0.042352822|       Dairy| 51.6982|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Typ