In [None]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=52509689f1b0e3eef930b5704b37e023fde1952dcd2aba25c3c1d6cf698290f0
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
import pyspark
from pyspark.sql import SparkSession
# Obtain a SparkSession registered under the application name "Spark".
spark = (
    SparkSession.builder
    .appName("Spark")
    .getOrCreate()
)

In [None]:
# Read the CSV, treating its first line as the column names rather than data.
food = spark.read.option("header", True).csv("/content/indian_food.csv")
# Print the leading rows as a text table.
food.show()

+--------------+--------------------+----------+---------+---------+--------------+-------+-------------+------+
|          name|         ingredients|      diet|prep_time|cook_time|flavor_profile| course|        state|region|
+--------------+--------------------+----------+---------+---------+--------------+-------+-------------+------+
|    Balu shahi|Maida flour, yogu...|vegetarian|       45|       25|         sweet|dessert|  West Bengal|  East|
|        Boondi|Gram flour, ghee,...|vegetarian|       80|       30|         sweet|dessert|    Rajasthan|  West|
|Gajar ka halwa|Carrots, milk, su...|vegetarian|       15|       60|         sweet|dessert|       Punjab| North|
|        Ghevar|Flour, ghee, kewr...|vegetarian|       15|       30|         sweet|dessert|    Rajasthan|  West|
|   Gulab jamun|Milk powder, plai...|vegetarian|       15|       40|         sweet|dessert|  West Bengal|  East|
|        Imarti|Sugar syrup, lent...|vegetarian|       10|       50|         sweet|dessert|  Wes

In [None]:
# Number of unique dishes.
# distinct() collapses repeated names first; without it the result is simply the
# row count, which over-reports as soon as any dish name appears more than once.
unique_dishes = food.select("name").distinct().count()
unique_dishes


255

In [None]:
# State with the largest number of dishes: tally the rows per state, sort the
# tallies from largest to smallest and read the state off the top row.
# Note: despite its name, this variable ends up holding the state name, not a count.
state_dishes_count = (
    food.groupBy("state")
    .count()
    .sort("count", ascending=False)
    .first()["state"]
)
state_dishes_count

'Gujarat'

In [None]:
# Number of dishes whose state is Karnataka
from pyspark.sql.functions import col

food.where(col("state") == "Karnataka").count()

6

In [None]:
# Distinct values found in the region column.
# Rows with a null in any column are removed first, and `food` is rebound to the
# reduced frame, so every cell executed after this one works on the filtered data.
food = food.dropna()
regions = food.select("region").distinct()
regions.show()

+----------+
|    region|
+----------+
|        -1|
|     South|
|   Central|
|      East|
|      West|
|North East|
|     North|
+----------+



In [None]:
# Dishes per region, listed from the smallest count to the largest
no_of_dishes_per_region = (
    food.groupBy("region")
    .count()
    .sort("count")
)
no_of_dishes_per_region.show()

+----------+-----+
|    region|count|
+----------+-----+
|   Central|    3|
|        -1|   13|
|North East|   25|
|      East|   31|
|     North|   49|
|     South|   59|
|      West|   74|
+----------+-----+



In [None]:
# Distinct values of the course column, then of the flavor_profile column
from pyspark.sql.functions import col

# Courses are printed first ...
distinct_course = food.select("course").distinct()
distinct_course.show()

# ... followed by the flavour profiles.
distinct_flavor = food.select("flavor_profile").distinct()
distinct_flavor.show()

+-----------+
|     course|
+-----------+
|    starter|
|    dessert|
|      snack|
|main course|
+-----------+

+--------------+
|flavor_profile|
+--------------+
|            -1|
|         spicy|
|         sweet|
|          sour|
|        bitter|
+--------------+



In [None]:
# State with the most main-course dishes.
# Only rows whose course is "main course" are tallied; without this filter the
# query merely repeats the overall dishes-per-state ranking.
from pyspark.sql.functions import col

max_main_course = (
    food.filter(col("course") == "main course")
    .groupBy("state")
    .count()
    .orderBy("count", ascending=False)
    .first()[0]  # first column of the top row is the state name
)
max_main_course

'Gujarat'

In [None]:
# Share of dishes (in percent) contributed by each region
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, IntegerType

count_dishes_per_region = food.groupBy("region").count()
total_dishes = food.count()


def percentage(data, total_dishes):
    """Return `data` as a percentage of `total_dishes`.

    Both arguments are coerced with int() before dividing, and the result is a float.
    """
    return (int(data) / int(total_dishes)) * 100


# The helper returns a float, so the UDF's declared return type must be DoubleType;
# declaring IntegerType does not match what the function actually produces.
udf1 = udf(percentage, DoubleType())

# The table itself uses built-in column arithmetic rather than the UDF:
# count / total * 100, sorted from the largest share down, with the raw count hidden.
(
    count_dishes_per_region
    .withColumn("Percentage", col("count") / total_dishes * 100)
    .orderBy("Percentage", ascending=False)
    .drop("count")
    .show()
)

+----------+------------------+
|    region|        Percentage|
+----------+------------------+
|      West|29.133858267716533|
|     South|23.228346456692915|
|     North|19.291338582677163|
|      East|12.204724409448819|
|North East|  9.84251968503937|
|        -1| 5.118110236220472|
|   Central|1.1811023622047243|
+----------+------------------+



In [None]:
# For every region, the state(s) that contribute the most dishes
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank

# One row per (state, region) pair with its dish count.
g1 = food.groupBy("state", "region").count()

# Rank the states inside each region by dish count, highest first.
window_spec = Window.partitionBy("region").orderBy(col("count").desc())
ranked = g1.withColumn("Rank", rank().over(window_spec))

# Keep only the top-ranked rows; rank() gives tied states the same position,
# so a region can list more than one state.
# The column is referenced with the exact casing it was created with ("Rank")
# so the lookup does not rely on case-insensitive column resolution.
ranked2 = ranked.filter(col("Rank") == 1)
ranked2.drop("Rank").show()


+--------------+----------+-----+
|         state|    region|count|
+--------------+----------+-----+
|            -1|        -1|   13|
|Madhya Pradesh|   Central|    2|
|   West Bengal|      East|   24|
|        Punjab|     North|   32|
|         Assam|North East|   21|
|    Tamil Nadu|     South|   20|
|       Gujarat|      West|   35|
+--------------+----------+-----+

