
# Loading data which is in csv format

In [10]:
from pyspark.sql import SparkSession


In [11]:
spark = SparkSession.builder.appName("MyPractice").getOrCreate()

In [12]:
df = spark.read.format('csv').option('inferschema',True).option('header',True).load('/content/sample_data/BigMart Sales.csv')

In [13]:
df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

# Loading data which is in json format

In [14]:
df_json = spark.read.format('json').option('inferSchema',True).option('header',True).option('multiline',False).load('/content/sample_data/drivers.json')

In [15]:
df_json .show()

+----+----------+--------+----------+--------------------+-----------+------+--------------------+
|code|       dob|driverId| driverRef|                name|nationality|number|                 url|
+----+----------+--------+----------+--------------------+-----------+------+--------------------+
| HAM|1985-01-07|       1|  hamilton|   {Lewis, Hamilton}|    British|    44|http://en.wikiped...|
| HEI|1977-05-10|       2|  heidfeld|    {Nick, Heidfeld}|     German|    \N|http://en.wikiped...|
| ROS|1985-06-27|       3|   rosberg|     {Nico, Rosberg}|     German|     6|http://en.wikiped...|
| ALO|1981-07-29|       4|    alonso|  {Fernando, Alonso}|    Spanish|    14|http://en.wikiped...|
| KOV|1981-10-19|       5|kovalainen|{Heikki, Kovalainen}|    Finnish|    \N|http://en.wikiped...|
| NAK|1985-01-11|       6|  nakajima|  {Kazuki, Nakajima}|   Japanese|    \N|http://en.wikiped...|
| BOU|1979-02-28|       7|  bourdais|{Sébastien, Bourd...|     French|    \N|http://en.wikiped...|
| RAI|1979

In [16]:
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [17]:
df_json.printSchema()

root
 |-- code: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- driverId: long (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- number: string (nullable = true)
 |-- url: string (nullable = true)




# Structype and Structfield

In [18]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [19]:
my_schema = StructType([
    StructField('code', StringType(), True),
    StructField('dob', StringType(), True),
    StructField('driverId', LongType(), True),
    StructField('driverRef', StringType(), True),
    StructField('name', StructType([
        StructField('family', StringType(), True)
    ]), True),
    StructField('nationality', StringType(), True),
    StructField('number', StringType(), True),
    StructField('url', StringType(), True)
])

# Selecting data

In [20]:
df_select = df.select(col('Item_Identifier'),col('Item_Weight'),col('Item_Fat_Content'))
df_select .show()

+---------------+-----------+----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|
+---------------+-----------+----------------+
|          FDA15|        9.3|         Low Fat|
|          DRC01|       5.92|         Regular|
|          FDN15|       17.5|         Low Fat|
|          FDX07|       19.2|         Regular|
|          NCD19|       8.93|         Low Fat|
|          FDP36|     10.395|         Regular|
|          FDO10|      13.65|         Regular|
|          FDP10|       NULL|         Low Fat|
|          FDH17|       16.2|         Regular|
|          FDU28|       19.2|         Regular|
|          FDY07|       11.8|         Low Fat|
|          FDA03|       18.5|         Regular|
|          FDX32|       15.1|         Regular|
|          FDS46|       17.6|         Regular|
|          FDF32|      16.35|         Low Fat|
|          FDP49|        9.0|         Regular|
|          NCB42|       11.8|         Low Fat|
|          FDP49|        9.0|         Regular|
|          DR


# Alias

In [21]:
df.select(col('Item_Identifier').alias('Item_id')) .show()

+-------+
|Item_id|
+-------+
|  FDA15|
|  DRC01|
|  FDN15|
|  FDX07|
|  NCD19|
|  FDP36|
|  FDO10|
|  FDP10|
|  FDH17|
|  FDU28|
|  FDY07|
|  FDA03|
|  FDX32|
|  FDS46|
|  FDF32|
|  FDP49|
|  NCB42|
|  FDP49|
|  DRI11|
|  FDU02|
+-------+
only showing top 20 rows




# Filtering


## filter data where Item_fat = Regular

In [22]:
df.filter(col('Item_Fat_content') == 'Regular') .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          FDX07|       19.2|         Regular|            0.0|Fruits and Vegeta...| 182.095|           OUT010|                     1998|       NULL|              Tier 3|    Gro


## filter where Tier = 1 or 2 and outlet size is  null

In [23]:
df.filter((col('Outlet_Size').isNull()) & (col('Outlet_Location_Type').isin('Tier 1','Tier 2'))) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDH17|       16.2|         Regular|    0.016687114|        Frozen Foods| 96.9726|           OUT045|                     2002|       NULL|              Tier 2|Supermarket Type1|        1076.5986|
|          FDU28|       19.2|         Regular|     0.09444959|        Frozen Foods|187.8214|           OUT017|                     2007|       NULL|              Tier 2|Superma


# with column renamed

In [24]:
df.withColumnRenamed('Item_Weight','Item_wg') .show()

+---------------+-------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_wg|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|    9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|   5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         


#  with Column function

In [25]:
df = df.withColumn('flag',lit('new'))
df .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138| new|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|        

In [26]:
df = df.withColumn('multiplied',col('Item_Weight')*col('Item_MRP'))
df .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138| new|2323.2255600000003|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drin

In [27]:
df.withColumn('Item_Fat_Content',regexp_replace(col('Item_Fat_Content'),'Regular','Reg')) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138| new|2323.2255600000003|
|          DRC01|       5.92|             Reg|    0.019278216|         Soft Drin


# TypeCasting

In [28]:
df = df.withColumn('Item_Weight',col('Item_Weight').cast(FloatType()))
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: float (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)
 |-- flag: string (nullable = false)
 |-- multiplied: double (nullable = true)



In [29]:
df.sort(col('Item_Weight').desc()) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|          FDC02|      21.35|         Low Fat|    0.068809463|              Canned|258.5278|           OUT035|                     2004|      Small|              Tier 2|Supermarket Type1|         5206.556| new|5519.5685300000005|
|          FDC02|      21.35|         Low Fat|    0.069102831|              Cann

In [30]:
df.sort(['Item_Weight','Item_Visibility'],ascending=[0,1]).limit(10) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|          FDC02|      21.35|         Low Fat|    0.068765205|              Canned|260.4278|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|        3644.5892| new|        5560.13353|
|          FDC02|      21.35|         Low Fat|    0.068809463|              Cann


#Drop

In [31]:
df.drop('Outlet_Type') .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+----+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Item_Outlet_Sales|flag|        multiplied|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+----+------------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|         3735.138| new|2323.2255600000003|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|    

In [32]:
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: float (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)
 |-- flag: string (nullable = false)
 |-- multiplied: double (nullable = true)




# Drop Duplicates

In [33]:
df.drop_duplicates() .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|          FDW28|      18.25|         Low Fat|            0.0|        Frozen Foods|196.8452|           OUT046|                     1997|      Small|              Tier 1|Supermarket Type1|        3327.6684| new|         3592.4249|
|          DRE27|      11.85|         Low Fat|     0.13267058|               Dai

In [34]:
df.drop_duplicates(subset =['Item_Type']) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+
|          FDP36|     10.395|         Regular|            0.0|        Baking Goods| 51.4008|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         556.6088| new| 534.3113159999999|
|          FDO23|      17.85|         Low Fat|            0.0|              Brea


# Union and Union byName

In [35]:
data1 = [('1','kad'),('2','sid')]
schema1 = 'id String , name String'
df1 = spark.createDataFrame(data = data1,schema = schema1)
data2 =[('3','Hello'),('4','World')]
df2 = spark.createDataFrame(data = data2,schema = schema1)


In [36]:
df1.union(df2) .show()

+---+-----+
| id| name|
+---+-----+
|  1|  kad|
|  2|  sid|
|  3|Hello|
|  4|World|
+---+-----+




When both columns have different schema we use unionByName


# String Functions

In [37]:
df.select(initcap('Item_Type')) .show()

+--------------------+
|  initcap(Item_Type)|
+--------------------+
|               Dairy|
|         Soft Drinks|
|                Meat|
|Fruits And Vegeta...|
|           Household|
|        Baking Goods|
|         Snack Foods|
|         Snack Foods|
|        Frozen Foods|
|        Frozen Foods|
|Fruits And Vegeta...|
|               Dairy|
|Fruits And Vegeta...|
|         Snack Foods|
|Fruits And Vegeta...|
|           Breakfast|
|  Health And Hygiene|
|           Breakfast|
|         Hard Drinks|
|               Dairy|
+--------------------+
only showing top 20 rows



In [38]:
df.select(upper('Item_Type')) .show()

+--------------------+
|    upper(Item_Type)|
+--------------------+
|               DAIRY|
|         SOFT DRINKS|
|                MEAT|
|FRUITS AND VEGETA...|
|           HOUSEHOLD|
|        BAKING GOODS|
|         SNACK FOODS|
|         SNACK FOODS|
|        FROZEN FOODS|
|        FROZEN FOODS|
|FRUITS AND VEGETA...|
|               DAIRY|
|FRUITS AND VEGETA...|
|         SNACK FOODS|
|FRUITS AND VEGETA...|
|           BREAKFAST|
|  HEALTH AND HYGIENE|
|           BREAKFAST|
|         HARD DRINKS|
|               DAIRY|
+--------------------+
only showing top 20 rows



In [39]:
df.select(lower('Item_Type')) .show()

+--------------------+
|    lower(Item_Type)|
+--------------------+
|               dairy|
|         soft drinks|
|                meat|
|fruits and vegeta...|
|           household|
|        baking goods|
|         snack foods|
|         snack foods|
|        frozen foods|
|        frozen foods|
|fruits and vegeta...|
|               dairy|
|fruits and vegeta...|
|         snack foods|
|fruits and vegeta...|
|           breakfast|
|  health and hygiene|
|           breakfast|
|         hard drinks|
|               dairy|
+--------------------+
only showing top 20 rows




# Date Functions

In [40]:
df = df.withColumn('curr_date',current_date())
df .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138| new|2323.2255600000003|2025-06-17|
|          DRC01|       5.92|       

In [41]:
df = df.withColumn('a_week_after',date_add('curr_date',7))
df .show()
df = df.withColumn('a_week_before',date_sub('curr_date',7))
df .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138| new|2323.2255600000003|2025-06-1


# Date_Format

In [42]:
df = df.withColumn('curr_date',date_format('curr_date','dd-MM-yyyy'))
df.limit(10) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         


# Handling null values

In [43]:
df.dropna('all') .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         

In [44]:
df.dropna(subset='Outlet_Size') .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         


# Fill Na

In [45]:
df.fillna('Not Available') .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-------------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|  Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-------------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|       Medium|              Tier 1|Supermarket Type1| 

In [46]:
df.fillna('Not Available',subset=['Outlet_Size']) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-------------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|  Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-------------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|       Medium|              Tier 1|Supermarket Type1| 


# Splitting and Indexing

In [47]:
df.withColumn('Outlet_Type',split('Outlet_Type'," ")) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+----+------------------+----------+------------+-------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|         Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+----+------------------+----------+------------+-------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|[Supermarket, Type


# Explode

In [48]:
exp = df.withColumn('Outlet_Type',split('Outlet_Type'," "))

In [49]:
new_exp =exp.withColumn('Outlet_Type',explode('Outlet_Type'))
new_exp .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+----+------------------+----------+------------+-------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+----+------------------+----------+------------+-------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket|         3735.138| new|2323.22556

# Array Contains

In [50]:
exp.withColumn('Type_1_flag',array_contains('Outlet_Type','Type1')) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+----+------------------+----------+------------+-------------+-----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|         Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|Type_1_flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+----+------------------+----------+------------+-------------+-----------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|   


# Group by

In [51]:
df.groupBy('Item_Type').agg(sum('Item_MRP')) .show()

+--------------------+------------------+
|           Item_Type|     sum(Item_MRP)|
+--------------------+------------------+
|       Starchy Foods|21880.027399999995|
|        Baking Goods| 81894.73640000001|
|              Breads| 35379.11979999999|
|Fruits and Vegeta...|178124.08099999998|
|                Meat|59449.863799999956|
|         Hard Drinks|29334.676599999995|
|         Soft Drinks|58514.164999999964|
|           Household|135976.52539999998|
|           Breakfast|        15596.6966|
|               Dairy|101276.45959999996|
|         Snack Foods|175433.92040000003|
|              Others|22451.891600000006|
|             Seafood| 9077.870000000003|
|              Canned|  90706.7269999999|
|        Frozen Foods|118558.88140000001|
|  Health and Hygiene|        68025.8388|
+--------------------+------------------+



In [52]:
df.groupBy('Item_Type').agg(avg('Item_MRP')) .show()

+--------------------+------------------+
|           Item_Type|     avg(Item_MRP)|
+--------------------+------------------+
|       Starchy Foods|147.83802297297294|
|        Baking Goods|126.38076604938273|
|              Breads| 140.9526685258964|
|Fruits and Vegeta...|144.58123457792206|
|                Meat|139.88203247058814|
|         Hard Drinks|137.07792803738315|
|         Soft Drinks|131.49250561797746|
|           Household|149.42475318681318|
|           Breakfast|141.78815090909092|
|               Dairy|148.49920762463336|
|         Snack Foods|146.19493366666669|
|              Others|132.85142958579885|
|             Seafood|141.84171875000004|
|              Canned|139.76383204930647|
|        Frozen Foods|138.50336612149533|
|  Health and Hygiene|130.81892076923077|
+--------------------+------------------+



In [53]:
df.groupBy('Item_Type','Outlet_Size').agg(avg('Item_MRP').alias('Average')) .show()

+--------------------+-----------+------------------+
|           Item_Type|Outlet_Size|           Average|
+--------------------+-----------+------------------+
|       Starchy Foods|     Medium| 148.4195041666666|
|Fruits and Vegeta...|     Medium| 142.9714702179177|
|       Starchy Foods|       NULL|140.48000465116277|
|              Breads|       NULL|139.04861666666667|
|        Baking Goods|       NULL|126.66939891891889|
|Fruits and Vegeta...|       NULL|142.57516045845267|
|        Frozen Foods|       High|         136.82925|
|         Soft Drinks|       High|131.75847346938772|
|           Breakfast|      Small|130.56802666666667|
|                Meat|     Medium|136.41913154362408|
|Fruits and Vegeta...|       High|145.57287042253515|
|                Meat|       High| 137.2447902439025|
|        Baking Goods|       High|129.20204383561642|
|           Household|     Medium|147.71133010380618|
|                Meat|       NULL|139.29453448275865|
|         Hard Drinks|      

In [54]:
df.groupBy('Item_Type','Outlet_Size').agg(sum('Item_MRP'),avg('Item_MRP')) .show()

+--------------------+-----------+------------------+------------------+
|           Item_Type|Outlet_Size|     sum(Item_MRP)|     avg(Item_MRP)|
+--------------------+-----------+------------------+------------------+
|       Starchy Foods|     Medium| 7124.136199999997| 148.4195041666666|
|Fruits and Vegeta...|     Medium|59047.217200000014| 142.9714702179177|
|       Starchy Foods|       NULL|         6040.6402|140.48000465116277|
|              Breads|       NULL|        10011.5004|139.04861666666667|
|        Baking Goods|       NULL|23433.838799999994|126.66939891891889|
|Fruits and Vegeta...|       NULL|49758.730999999985|142.57516045845267|
|        Frozen Foods|       High|12588.291000000001|         136.82925|
|         Soft Drinks|       High| 6456.165199999999|131.75847346938772|
|           Breakfast|      Small|3917.0407999999998|130.56802666666667|
|                Meat|     Medium| 20326.45059999999|136.41913154362408|
|Fruits and Vegeta...|       High| 20671.3475999999


# Collect List


In [55]:
data = [('user1','book1'),
        ('user1','book2'),
        ('user2','book2'),
        ('user2','book4'),
        ('user3','book1'),
        ('user3','book2')]
schema = 'user string,book string'
books = spark.createDataFrame(data,schema)
books .show()

+-----+-----+
| user| book|
+-----+-----+
|user1|book1|
|user1|book2|
|user2|book2|
|user2|book4|
|user3|book1|
|user3|book2|
+-----+-----+



# Pivot

In [56]:
df.groupBy('Outlet_Size').pivot('Item_type').agg(avg('Item_MRP')) .show()

+-----------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|Outlet_Size|      Baking Goods|            Breads|         Breakfast|            Canned|             Dairy|      Frozen Foods|Fruits and Vegetables|       Hard Drinks|Health and Hygiene|         Household|              Meat|            Others|           Seafood|       Snack Foods|       Soft Drinks|     Starchy Foods|
+-----------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|       High|129.20204383561642|     

# When-Otherwise

In [57]:
nv = df.withColumn('Non_Veg',when(col('Item_Type')=='Meat','True').otherwise('False'))
nv .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|Non_Veg|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Sup

In [58]:
nv.withColumn('Non_Veg_exp_Flag',when(((col('Non_Veg')=='True')&(col('Item_MRP')<100)),'Non Veg Not Expensive').otherwise('Non Veg Expensive')) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|Non_Veg| Non_Veg_exp_Flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|        

In [59]:
dataj1 = [('1','gaur','d01'),
          ('2','kit','d02'),
          ('3','sam','d03'),
          ('4','tim','d03'),
          ('5','aman','d05'),
          ('6','nad','d06')]

schemaj1 = 'emp_id STRING, emp_name STRING, dept_id STRING'

df1 = spark.createDataFrame(dataj1,schemaj1)

dataj2 = [('d01','HR'),
          ('d02','Marketing'),
          ('d03','Accounts'),
          ('d04','IT'),
          ('d05','Finance')]

schemaj2 = 'dept_id STRING, department STRING'

df2 = spark.createDataFrame(dataj2,schemaj2)

In [60]:
df1 .show()
df2 .show()

+------+--------+-------+
|emp_id|emp_name|dept_id|
+------+--------+-------+
|     1|    gaur|    d01|
|     2|     kit|    d02|
|     3|     sam|    d03|
|     4|     tim|    d03|
|     5|    aman|    d05|
|     6|     nad|    d06|
+------+--------+-------+

+-------+----------+
|dept_id|department|
+-------+----------+
|    d01|        HR|
|    d02| Marketing|
|    d03|  Accounts|
|    d04|        IT|
|    d05|   Finance|
+-------+----------+



# Joins

## Inner and outer

In [61]:
df1.join(df2,df1['dept_id']==df2['dept_id'],'inner') .show()

df1.join(df2,df1['dept_id']==df2['dept_id'],'outer') .show()




+------+--------+-------+-------+----------+
|emp_id|emp_name|dept_id|dept_id|department|
+------+--------+-------+-------+----------+
|     1|    gaur|    d01|    d01|        HR|
|     2|     kit|    d02|    d02| Marketing|
|     3|     sam|    d03|    d03|  Accounts|
|     4|     tim|    d03|    d03|  Accounts|
|     5|    aman|    d05|    d05|   Finance|
+------+--------+-------+-------+----------+

+------+--------+-------+-------+----------+
|emp_id|emp_name|dept_id|dept_id|department|
+------+--------+-------+-------+----------+
|     1|    gaur|    d01|    d01|        HR|
|     2|     kit|    d02|    d02| Marketing|
|     3|     sam|    d03|    d03|  Accounts|
|     4|     tim|    d03|    d03|  Accounts|
|  NULL|    NULL|   NULL|    d04|        IT|
|     5|    aman|    d05|    d05|   Finance|
|     6|     nad|    d06|   NULL|      NULL|
+------+--------+-------+-------+----------+



## Right and Left

In [62]:
df1.join(df2,df1['dept_id']==df2['dept_id'],'left') .show()
df1.join(df2,df1['dept_id']==df2['dept_id'],'right') .show()

+------+--------+-------+-------+----------+
|emp_id|emp_name|dept_id|dept_id|department|
+------+--------+-------+-------+----------+
|     1|    gaur|    d01|    d01|        HR|
|     3|     sam|    d03|    d03|  Accounts|
|     2|     kit|    d02|    d02| Marketing|
|     4|     tim|    d03|    d03|  Accounts|
|     6|     nad|    d06|   NULL|      NULL|
|     5|    aman|    d05|    d05|   Finance|
+------+--------+-------+-------+----------+

+------+--------+-------+-------+----------+
|emp_id|emp_name|dept_id|dept_id|department|
+------+--------+-------+-------+----------+
|     1|    gaur|    d01|    d01|        HR|
|     2|     kit|    d02|    d02| Marketing|
|     4|     tim|    d03|    d03|  Accounts|
|     3|     sam|    d03|    d03|  Accounts|
|  NULL|    NULL|   NULL|    d04|        IT|
|     5|    aman|    d05|    d05|   Finance|
+------+--------+-------+-------+----------+



## Left anti and semi

In [63]:
df1.join(df2,df1['dept_id']==df2['dept_id'],'leftanti') .show()
df1.join(df2,df1['dept_id']==df2['dept_id'],'leftsemi') .show()

+------+--------+-------+
|emp_id|emp_name|dept_id|
+------+--------+-------+
|     6|     nad|    d06|
+------+--------+-------+

+------+--------+-------+
|emp_id|emp_name|dept_id|
+------+--------+-------+
|     1|    gaur|    d01|
|     2|     kit|    d02|
|     3|     sam|    d03|
|     4|     tim|    d03|
|     5|    aman|    d05|
+------+--------+-------+



## Cross Join

In [64]:
df1.join(df2,df1['dept_id']==df2['dept_id'],'cross') .show()

+------+--------+-------+-------+----------+
|emp_id|emp_name|dept_id|dept_id|department|
+------+--------+-------+-------+----------+
|     1|    gaur|    d01|    d01|        HR|
|     2|     kit|    d02|    d02| Marketing|
|     3|     sam|    d03|    d03|  Accounts|
|     4|     tim|    d03|    d03|  Accounts|
|     5|    aman|    d05|    d05|   Finance|
+------+--------+-------+-------+----------+




# Window Functions

In [65]:
from pyspark.sql.window import Window

## Row Number

In [66]:
df.withColumn('row_number',row_number().over(Window.orderBy(col('Item_MRP').desc()))) .show()

+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|         Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|row_number|
+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+----------+
|          FDS13|      6.465|         Low Fat|    0.125210375|            Canned|266.8884|           OUT017|                     2007|       NULL|              Tier 2|Su

## Rank

In [67]:
df.withColumn('rank_number',row_number().over(Window.orderBy(col('Item_MRP').desc()))) .show()

+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|         Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|rank_number|
+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-----------+
|          FDS13|      6.465|         Low Fat|    0.125210375|            Canned|266.8884|           OUT017|                     2007|       NULL|              Tier 2

## Dense_rank

In [68]:
df.withColumn('dense_rank',row_number().over(Window.orderBy(col('Item_MRP').desc()))) .show()

+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|         Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|dense_rank|
+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+----------+
|          FDS13|      6.465|         Low Fat|    0.125210375|            Canned|266.8884|           OUT017|                     2007|       NULL|              Tier 2|Su

## Cumulative Sum

In [69]:
df.withColumn('Cum_sum',sum('Item_MRP').over(Window.orderBy('Item_Type'))) .show()

+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|   Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|          Cum_sum|
+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+-----------------+
|          FDP36|     10.395|         Regular|            0.0|Baking Goods| 51.4008|           OUT018|                     2009|     Medium|              Tier 3|Super

In [70]:
df.withColumn('Cum_sum',sum('Item_MRP').over(Window.orderBy('Item_Type').rowsBetween(Window.unboundedPreceding,Window.currentRow))) .show()

+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|   Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|           Cum_sum|
+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+------------------+
|          FDP36|     10.395|         Regular|            0.0|Baking Goods| 51.4008|           OUT018|                     2009|     Medium|              Tier 3|Su


# User Defined Functions

In [71]:
def square(x):
  return x*x
from pyspark.sql.functions import udf
square_udf = udf(square)

In [72]:
df.withColumn('square',square_udf('Item_MRP')) .show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|        multiplied| curr_date|a_week_after|a_week_before|            square|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+------------------+----------+------------+-------------+------------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|   


# Data Writing and Modes

In [73]:
df.write.format('csv').save('/Volumes/ninad/default/ninad/data.csv')

## Append

In [74]:
df.write.format('csv').mode('append').save('/Volumes/ninad/default/ninad/data.csv')

## OverWrite

In [75]:
df.write.format('csv').mode('overwrite').save('/Volumes/ninad/default/ninad/data.csv')

## Error

In [76]:
df.write.format('csv').mode('error').save('/Volumes/ninad/default/ninad/data.csv')

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/Volumes/ninad/default/ninad/data.csv already exists. Set mode as "overwrite" to overwrite the existing path.

## Ignore

In [None]:
df.write.format('csv').mode('ignore').save('/Volumes/ninad/default/ninad/data.csv')