In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pyspark



In [37]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Indian Food').getOrCreate()

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [None]:
df = spark.read.csv('/content/drive/MyDrive/BDA_1_Sem/ABD/ABD_Lab/Hadoop/Pyspark/Spark2/indian_food.csv', header = True, inferSchema = True)
df.show()

+--------------+--------------------+----------+---------+---------+--------------+-------+-------------+
|          name|         ingredients|      diet|prep_time|cook_time|flavor_profile| course|        state|
+--------------+--------------------+----------+---------+---------+--------------+-------+-------------+
|    Balu shahi|Maida flour, yogu...|vegetarian|       45|       25|         sweet|dessert|  West Bengal|
|        Boondi|Gram flour, ghee,...|vegetarian|       80|       30|         sweet|dessert|    Rajasthan|
|Gajar ka halwa|Carrots, milk, su...|vegetarian|       15|       60|         sweet|dessert|       Punjab|
|        Ghevar|Flour, ghee, kewr...|vegetarian|       15|       30|         sweet|dessert|    Rajasthan|
|   Gulab jamun|Milk powder, plai...|vegetarian|       15|       40|         sweet|dessert|  West Bengal|
|        Imarti|Sugar syrup, lent...|vegetarian|       10|       50|         sweet|dessert|  West Bengal|
|        Jalebi|Maida, corn flour...|vegetaria

In [None]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- ingredients: string (nullable = true)
 |-- diet: string (nullable = true)
 |-- prep_time: integer (nullable = true)
 |-- cook_time: integer (nullable = true)
 |-- flavor_profile: string (nullable = true)
 |-- course: string (nullable = true)
 |-- state: string (nullable = true)



Q1) Find out how many unique dishes are present.

In [None]:
df1 = df.select('name').distinct()
df1.show()

+-----------------+
|             name|
+-----------------+
|            Kheer|
|          Poriyal|
|Sabudana Khichadi|
|   Gajar ka halwa|
|     Keerai sadam|
|         Dalithoy|
|         Mihidana|
|          Singori|
|          Uttapam|
|          Gavvalu|
|   Kakinada khaja|
|   Chak Hao Kheer|
|          Pachadi|
|         Vindaloo|
|     Gheela Pitha|
|         Idiappam|
|     Keerai kootu|
|             Idli|
|            Saath|
|            Rabri|
+-----------------+
only showing top 20 rows



In [None]:
df1.count()

255

Q2) Which state has more dishes

In [None]:
df2 = df.groupBy('state').count().orderBy(desc('count'))
df2.show()

+---------------+-----+
|          state|count|
+---------------+-----+
|        Gujarat|   35|
|         Punjab|   32|
|    Maharashtra|   30|
|             -1|   24|
|    West Bengal|   24|
|          Assam|   21|
|     Tamil Nadu|   20|
| Andhra Pradesh|   10|
|  Uttar Pradesh|    9|
|         Kerala|    8|
|         Odisha|    7|
|      Karnataka|    6|
|      Rajasthan|    6|
|      Telangana|    5|
|            Goa|    3|
|          Bihar|    3|
| Madhya Pradesh|    2|
|        Manipur|    2|
|Jammu & Kashmir|    2|
|       Nagaland|    1|
+---------------+-----+
only showing top 20 rows



Q3) How many dishes from state Karnataka?

In [None]:
df3 = df.filter(df['state'] == 'Karnataka')
df3.count()

6

In [None]:
df3.show()

+--------------+--------------------+----------+---------+---------+--------------+-----------+---------+
|          name|         ingredients|      diet|prep_time|cook_time|flavor_profile|     course|    state|
+--------------+--------------------+----------+---------+---------+--------------+-----------+---------+
| Dharwad pedha|Milk, Sugar, Dhar...|vegetarian|       20|       60|         sweet|    dessert|Karnataka|
|    Mysore pak|Besan flour, semo...|vegetarian|        5|       20|         sweet|    dessert|Karnataka|
|Obbattu holige|Maida flour, turm...|vegetarian|      180|       60|         sweet|main course|Karnataka|
|Bisi bele bath|Split pigeon peas...|vegetarian|       30|       45|         spicy|main course|Karnataka|
|     Koshambri|Moong dal, cucumb...|vegetarian|       10|       20|         spicy|main course|Karnataka|
|       Sandige|Thin rice flakes,...|vegetarian|      120|       60|            -1|main course|Karnataka|
+--------------+--------------------+---------

Q4) List number of unique regions

In [None]:
south_states = ["Andhra Pradesh", "Karnataka", "Kerala", "Tamil Nadu", "Telangana"]
north_states = ["Delhi", "Haryana", "Himachal Pradesh", "Jammu & Kashmir", "Punjab", "Rajasthan", "Uttar Pradesh", "Uttarakhand"]
east_states = ["Bihar", "Jharkhand", "Odisha", "West Bengal"]
west_states = ["Gujarat", "Maharashtra", "Goa", "Madhya Pradesh", "Chhattisgarh"]

df = df.withColumn("region",
                   when(df["state"].isin(south_states), lit("South"))
                   .when(df["state"].isin(north_states), lit("North"))
                   .when(df["state"].isin(east_states), lit("East"))
                   .when(df["state"].isin(west_states), lit("West"))
                   .otherwise(lit("-1")))

In [None]:
df4 = df.select('region').distinct()
df4.show()

+------+
|region|
+------+
|    -1|
| South|
|  East|
|  West|
| North|
+------+



Q5) Count number of dishes from each region.

In [None]:
df5 = df.groupBy('region').count().orderBy(desc('count'))
df5.show()

+------+-----+
|region|count|
+------+-----+
|  West|   71|
| North|   51|
|    -1|   50|
| South|   49|
|  East|   34|
+------+-----+



Q6) List unique 'flavor_profile' and 'course'

In [None]:
df6 = df.select('flavor_profile').distinct()
df6.show()

+--------------+
|flavor_profile|
+--------------+
|            -1|
|         spicy|
|         sweet|
|          sour|
|        bitter|
+--------------+



Q7) Which state has more 'main course'

In [None]:
df7 = df.filter(df['course'] == 'main course').groupBy('state').count().orderBy(desc('count'))
df7.show()

+---------------+-----+
|          state|count|
+---------------+-----+
|         Punjab|   28|
|     Tamil Nadu|   17|
|          Assam|   15|
|        Gujarat|   12|
|    Maharashtra|   12|
|             -1|    9|
|    West Bengal|    9|
|         Kerala|    5|
|      Karnataka|    4|
|      Rajasthan|    3|
|  Uttar Pradesh|    3|
|          Bihar|    2|
|       Nagaland|    1|
|         Odisha|    1|
| Madhya Pradesh|    1|
|        Manipur|    1|
|Jammu & Kashmir|    1|
|            Goa|    1|
|        Haryana|    1|
|   NCT of Delhi|    1|
+---------------+-----+
only showing top 20 rows



Q8) Give the %of dishes from each region.

In [None]:
df8 = df.groupBy('region').count()
df8.show()

+------+-----+
|region|count|
+------+-----+
|    -1|   50|
| South|   49|
|  East|   34|
|  West|   71|
| North|   51|
+------+-----+



In [None]:
df8 = df.groupBy('region').count()
total_count = df8.agg(sum('count')).collect()[0][0]
df8_with_percentage = df8.withColumn('percentage', (df8['count'] / total_count) * 100)
df8_with_percentage.show()

+------+-----+------------------+
|region|count|        percentage|
+------+-----+------------------+
|    -1|   50|19.607843137254903|
| South|   49|19.215686274509807|
|  East|   34|13.333333333333334|
|  West|   71| 27.84313725490196|
| North|   51|              20.0|
+------+-----+------------------+



Q9) List the states which has more dishes from each region.

In [None]:
df9 = df.groupBy('state', 'region').count().orderBy(desc('count'))
df9.show()

+---------------+------+-----+
|          state|region|count|
+---------------+------+-----+
|        Gujarat|  West|   35|
|         Punjab| North|   32|
|    Maharashtra|  West|   30|
|    West Bengal|  East|   24|
|             -1|    -1|   24|
|          Assam|    -1|   21|
|     Tamil Nadu| South|   20|
| Andhra Pradesh| South|   10|
|  Uttar Pradesh| North|    9|
|         Kerala| South|    8|
|         Odisha|  East|    7|
|      Karnataka| South|    6|
|      Rajasthan| North|    6|
|      Telangana| South|    5|
|          Bihar|  East|    3|
|            Goa|  West|    3|
| Madhya Pradesh|  West|    2|
|Jammu & Kashmir| North|    2|
|        Manipur|    -1|    2|
|   Chhattisgarh|  West|    1|
+---------------+------+-----+
only showing top 20 rows



In [None]:
df9 = df.groupBy('state', 'region').count()
window_spec = Window.partitionBy('region').orderBy(desc('count'))
df9_with_rank = df9.withColumn('rank', row_number().over(window_spec))
top_states = df9_with_rank.filter(df9_with_rank.rank <= 3)
top_states.select('state', 'region', 'count').show()

+--------------+------+-----+
|         state|region|count|
+--------------+------+-----+
|            -1|    -1|   24|
|         Assam|    -1|   21|
|       Manipur|    -1|    2|
|   West Bengal|  East|   24|
|        Odisha|  East|    7|
|         Bihar|  East|    3|
|        Punjab| North|   32|
| Uttar Pradesh| North|    9|
|     Rajasthan| North|    6|
|    Tamil Nadu| South|   20|
|Andhra Pradesh| South|   10|
|        Kerala| South|    8|
|       Gujarat|  West|   35|
|   Maharashtra|  West|   30|
|           Goa|  West|    3|
+--------------+------+-----+

