In [None]:
# !pip install geopy
# from geopy.geocoders import Nominatim

import os

from pyspark.sql.functions import *

In [None]:
# Storage account info
storage_account_name = "capstonestoragezomato"
# The account key must never be committed with the notebook. It is read from the
# AZURE_STORAGE_ACCOUNT_KEY environment variable (on Databricks a cluster
# environment variable can point at a secret scope). When the variable is not
# set the value falls back to an empty string, exactly as before.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
container_name = "cleaned-data"

# Set up the configuration: register the key for this storage account so the
# wasbs:// paths used by the read cells can authenticate.
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

In [None]:
# Read the "operational_analytics" parquet folder from the configured container
# into a DataFrame and print a preview of it.
blob_path_operationalAnalytics = (
    f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/operational_analytics"
)
zomato_delivery_operational_analytics_df = (
    spark.read
    .format("parquet")
    .load(blob_path_operationalAnalytics)
)
zomato_delivery_operational_analytics_df.show()

+------+-------------------+-------------------+-------------------+--------------------+---------------------------+-----------------------+-----------------+-------------------+--------------------------+--------------------+--------------------+-------------+-------------+----------------+------------------+----------------+----------+-------------------+------------------+--------+-------+--------+-----------+------------+------+
|    ID|  Time_Order_picked|        Time_Orderd|Restaurant_latitude|Restaurant_longitude|Delivery_location_longitude|Delivery_person_Ratings|Vehicle_condition|Delivery_person_Age|Delivery_location_latitude|Road_traffic_density|       unique_row_id|Type_of_order|         City|Time_taken (min)|Delivery_person_ID| Type_of_vehicle|Order_Date|multiple_deliveries|Weather_conditions|Festival|Year_ID|Month_ID|Day_of_Week|Week_of_Year|QTR_ID|
+------+-------------------+-------------------+-------------------+--------------------+---------------------------+-------

In [None]:
# Register the delivery DataFrame as the temp view queried by the %sql cells.
zomato_delivery_operational_analytics_df.createOrReplaceTempView(name="operationalAnalytics")

In [None]:
# Read the "metropolitan_data" parquet folder from the configured container
# into a DataFrame and print a preview of it.
blob_path_metropolitanData = (
    f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/metropolitan_data"
)
zomato_dataset_metropolitan_df = (
    spark.read
    .format("parquet")
    .load(blob_path_metropolitanData)
)
zomato_dataset_metropolitan_df.show()

+---------------+--------------------+-------------+------------+--------------------+-----+------+------------+--------------+--------------------+----------+--------------------+-----------+
|Delivery_Rating|           Item_Name|Dining_Rating|     Cuisine|          Place_Name|Votes|Prices|Dining_Votes|Delivery_Votes|       unique_row_id|      City|     Restaurant_Name|Best_Seller|
+---------------+--------------------+-------------+------------+--------------------+-----+------+------------+--------------+--------------------+----------+--------------------+-----------+
|            3.7|   Cheese Corn Balls|          4.1|       Pizza|         Civil Lines|   15|199.99|         154|             0|2b4ea3c2-634b-414...|    Raipur|            Veggiies|       null|
|            3.9|     Lal Peda [1 kg]|          3.3|     Chinese|        Indira Nagar|    0| 540.0|          13|             0|b3f5b5a2-d8a9-425...|   Lucknow|Pawan Sweet & Bho...| BESTSELLER|
|            3.6|Veg Paneer Mayo T.

In [None]:
# Register the metropolitan DataFrame as the temp view queried by the %sql cells.
zomato_dataset_metropolitan_df.createOrReplaceTempView(name="metropolitanData")

In [None]:
%sql
-- Test area
-- SELECT DISTINCT City
-- FROM metropolitanData
-- ORDER BY City

In [None]:
%sql
-- Number of distinct item names in the metropolitan dataset.
SELECT
  COUNT(DISTINCT m.Item_Name) AS different_items
FROM metropolitanData AS m

different_items
55687


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Number of distinct restaurant names in the metropolitan dataset.
SELECT
  COUNT(DISTINCT m.Restaurant_Name) AS different_restaurant_vendors
FROM metropolitanData AS m

different_restaurant_vendors
826


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- TOP 10 cities by number of rows that carry a restaurant name.
-- NOTE: COUNT(Restaurant_Name) counts every row with a non-null restaurant name,
-- not distinct restaurants, so a restaurant is counted once per row it appears in.
-- The alias restaurant_count is kept so existing charts keep working.
-- City is a secondary sort key so ties at the LIMIT boundary resolve deterministically.
SELECT City, COUNT(Restaurant_Name) AS restaurant_count
FROM metropolitanData
GROUP BY City
ORDER BY restaurant_count DESC, City ASC
LIMIT 10

City,restaurant_count
Hyderabad,15613
Jaipur,14438
Mumbai,13529
Chennai,13100
Bangalore,12040
Ahmedabad,10178
Kolkata,8662
Pune,8067
Kochi,7759
Raipur,7700


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- Top 10 cities by number of distinct restaurant names ("franchises").
-- City is a secondary sort key: the counts are small integers, so ties are
-- likely and would otherwise make the LIMIT 10 cut-off non-deterministic.
SELECT COUNT(DISTINCT Restaurant_Name) AS franchise_count, City
FROM metropolitanData
GROUP BY City
ORDER BY franchise_count DESC, City ASC
LIMIT 10

franchise_count,City
129,Hyderabad
92,Jaipur
87,Chennai
81,Bangalore
80,Mumbai
74,Ahmedabad
72,Kochi
64,Kolkata
60,Pune
57,Lucknow


In [None]:
%sql

-- Top 10 cities with the cheapest average price.
-- Rows without a price are excluded up front: an ascending sort places NULL
-- first by default, so a city whose prices are all NULL would otherwise be
-- reported as the cheapest. AVG already ignores NULLs, so averages of the
-- remaining cities are unchanged.
-- City is a secondary sort key so ties at the LIMIT boundary are deterministic.
SELECT City, ROUND(AVG(Prices), 2) AS avg_price
FROM metropolitanData
WHERE Prices IS NOT NULL
GROUP BY City
ORDER BY avg_price ASC, City ASC
LIMIT 10

City,avg_price
Raipur,194.59
Malleshwaram,196.86
Jaipur,217.16
Goa,222.99
Ahmedabad,223.68
Kochi,225.95
Magrath Road,226.77
Bangalore,233.42
New Delhi,235.35
Lucknow,235.55


Databricks visualization. Run in Databricks to view.

In [None]:
# %sql

# -- Restaurant_Name ordered by Dining_Votes and Dining_Rating 
# SELECT Restaurant_Name, AVG(Dining_Votes) avg_Dining_Votes, ROUND(AVG(Dining_Rating), 3) AS avg_Dining_Rating, COUNT(*)
# FROM metropolitanData
# WHERE Dining_Votes>0 AND ROUND(Dining_Rating, 3)>=4.5
# GROUP BY Restaurant_Name
# ORDER BY avg_Dining_Votes DESC

In [None]:
%sql

-- TOP 3 cheapest restaurants (by average price) in each city.
-- DENSE_RANK keeps every restaurant that shares one of the three lowest
-- averages, so a city can return more than three rows when averages tie.
WITH restaurant_prices AS (
    -- One row per (restaurant, city) with its rounded average price.
    -- Rows without a price are dropped so a restaurant whose prices are all
    -- NULL cannot be ranked first (ascending order sorts NULL first by default).
    SELECT Restaurant_Name, City,
           ROUND(AVG(Prices), 2) AS avg_price
           -- Optional extra metrics, currently disabled:
           -- ROUND(AVG(Delivery_Rating), 2) AS avg_Delivery_Rating,
           -- ROUND(AVG(Dining_Rating), 2) AS avg_Dining_Rating,
           -- ROUND(AVG(Delivery_Votes), 2) AS avg_Delivery_Votes,
           -- ROUND(AVG(Dining_Votes), 2) AS avg_Dining_Votes
    FROM metropolitanData
    WHERE Prices IS NOT NULL
    GROUP BY Restaurant_Name, City
),
ranked_prices AS (
    SELECT *, DENSE_RANK() OVER (PARTITION BY City ORDER BY avg_price ASC) AS rank
    FROM restaurant_prices
)
SELECT Restaurant_Name, City, avg_price
FROM ranked_prices
WHERE rank <= 3
-- Explicit ordering: without it the row order of the result is not guaranteed.
ORDER BY City, avg_price, Restaurant_Name

Restaurant_Name,City,avg_price
Bole To Vadapav,Ahmedabad,40.85
Karnavati Dabeli,Ahmedabad,67.99
Jay Bhavani Vadapav,Ahmedabad,75.0
GOPIZZA,Banaswadi,335.89
Iyer Mess,Bangalore,56.57
Sandwich Guru,Bangalore,67.98
Puliyogare Point,Bangalore,69.02
Ibrahim Biriyani,Chennai,110.0
Tower Burger,Chennai,110.21
The Burger Cafe,Chennai,114.52


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- Top 3 restaurants in each city by number of distinct item names.
WITH restaurant_items AS (
    -- One row per (restaurant, city) with its count of distinct item names.
    SELECT Restaurant_Name, City, COUNT(DISTINCT Item_Name) AS unique_items
    FROM metropolitanData
    GROUP BY Restaurant_Name, City
),
ranked_items AS (
    -- ROW_NUMBER picks exactly three rows per city; Restaurant_Name breaks ties
    -- in unique_items so the same three rows are chosen on every run.
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY City
               ORDER BY unique_items DESC, Restaurant_Name ASC
           ) AS rank
    FROM restaurant_items
)
SELECT Restaurant_Name, City, unique_items
FROM ranked_items
WHERE rank <= 3
-- Explicit ordering: without it the row order of the result is not guaranteed.
ORDER BY City, unique_items DESC, Restaurant_Name

Restaurant_Name,City,unique_items
Prithvi Hotel,Ahmedabad,316
P. Bhagat Tarachand,Ahmedabad,278
Fayrouz,Ahmedabad,278
GOPIZZA,Banaswadi,85
FreshMenu,Bangalore,339
Roti Ghar,Bangalore,281
Beijing Bites,Bangalore,248
Savoury Sea Shell,Chennai,558
Liza Restaurant,Chennai,369
Babal Da Punjabi Dabha,Chennai,357


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- most popular item in each city (the item name with the most rows in that city)
WITH item_counts AS (
  -- One row per (city, item name) with the number of rows it appears in.
  SELECT City, Item_Name, COUNT(*) AS count
  FROM metropolitanData
  GROUP BY City, Item_Name
),
ranked_items AS (
  -- Item_Name breaks ties in count so the winner per city is the same on every run.
  SELECT *, ROW_NUMBER() OVER (PARTITION BY City ORDER BY count DESC, Item_Name ASC) AS rank
  FROM item_counts
)
SELECT *
FROM ranked_items
WHERE rank = 1
-- Explicit ordering: without it the row order of the result is not guaranteed.
ORDER BY City

City,Item_Name,count,rank
Ahmedabad,Jeera Rice,31,1
Banaswadi,Veggie Finger Strips,2,1
Bangalore,Chicken Fried Rice,37,1
Chennai,Chicken Fried Rice,52,1
Goa,Chicken Biryani,17,1
Hyderabad,Chicken Fried Rice,74,1
Jaipur,Honey Chilli Potato,41,1
Kochi,Chocolate Shake,25,1
Kolkata,Chicken Biryani,26,1
Lucknow,Paneer Butter Masala,23,1


Databricks visualization. Run in Databricks to view.

In [None]:
# # geocoder object
# geolocator = Nominatim(user_agent="app")

# # latitude and longitude
# latitude = 10.96185
# longitude =  76.971082

# # reverse geocoding
# location = geolocator.reverse((latitude, longitude))

# print(location.address)

In [None]:
# %sql

# -- average delivery time per restaurant location (latitude, longitude); average delay for pickup still to be added
# SELECT Restaurant_latitude, Restaurant_longitude, ROUND(AVG(`Time_taken (min)`), 3) AS avg_Time_taken
# FROM operationalAnalytics
# WHERE `Time_taken (min)` IS NOT NULL
# GROUP BY Restaurant_latitude, Restaurant_longitude

In [None]:
%sql

-- Mean of `Time_taken (min)` for each order type, lowest mean first.
SELECT
  oa.Type_of_order,
  ROUND(AVG(oa.`Time_taken (min)`), 3) AS avg_time_taken
FROM operationalAnalytics AS oa
GROUP BY oa.Type_of_order
ORDER BY avg_time_taken

Type_of_order,avg_time_taken
Drinks,27.379
Buffet,27.406
Snack,27.547
Meal,27.639


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- Per road-traffic level: mean gap in minutes between order and pickup
-- timestamps, and mean of `Time_taken (min)`. Only rows with a recorded
-- `Time_taken (min)` are considered; lowest mean time taken first.
SELECT
  oa.Road_traffic_density,
  ROUND(AVG(TIMESTAMPDIFF(MINUTE, oa.Time_Orderd, oa.Time_Order_picked)), 3) AS avg_pickup_time,
  ROUND(AVG(oa.`Time_taken (min)`), 3) AS avg_time_taken
FROM operationalAnalytics AS oa
WHERE oa.`Time_taken (min)` IS NOT NULL
GROUP BY oa.Road_traffic_density
ORDER BY avg_time_taken

Road_traffic_density,avg_pickup_time,avg_time_taken
Low,9.888,22.106
Medium,10.015,27.694
High,9.938,28.103
Jam,9.961,31.937


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- Order volume per month and weekday number, with the weekday number also
-- spelled out as a name (1 = Sunday ... 7 = Saturday; anything else is NULL).
SELECT
  oa.Month_ID,
  oa.Day_of_Week,
  COUNT(oa.ID) AS order_count,
  CASE oa.Day_of_Week
    WHEN 1 THEN 'Sunday'
    WHEN 2 THEN 'Monday'
    WHEN 3 THEN 'Tuesday'
    WHEN 4 THEN 'Wednesday'
    WHEN 5 THEN 'Thursday'
    WHEN 6 THEN 'Friday'
    WHEN 7 THEN 'Saturday'
  END AS day_name
FROM operationalAnalytics AS oa
GROUP BY oa.Month_ID, oa.Day_of_Week
ORDER BY oa.Month_ID, oa.Day_of_Week

Month_ID,Day_of_Week,order_count,day_name
2,1,651,Sunday
2,2,603,Monday
2,3,628,Tuesday
2,4,610,Wednesday
2,5,627,Thursday
2,6,1290,Friday
2,7,596,Saturday
3,1,2783,Sunday
3,2,2913,Monday
3,3,2890,Tuesday


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- relation between weather condition and average time taken to fulfil an order
-- Fulfilment time = `Time_taken (min)` + minutes between order and pickup.
-- The two parts are added per row before averaging, so both are averaged over
-- the same rows. AVG(a) + AVG(b) skips NULLs independently for each part and
-- mixes different row sets whenever only one of the two values is missing.
-- num_orders still counts every row in the group, including rows that do not
-- contribute to the average.
SELECT Weather_conditions,
       ROUND(AVG(`Time_taken (min)` + TIMESTAMPDIFF(MINUTE, Time_Orderd, Time_Order_picked)), 3) AS avg_time_taken,
       COUNT(*) AS num_orders
FROM operationalAnalytics
GROUP BY Weather_conditions
ORDER BY avg_time_taken ASC

Weather_conditions,avg_time_taken,num_orders
Sunny,32.712,4718
Stormy,36.87,5236
Sandstorms,37.077,5119
Windy,37.097,5222
Cloudy,40.114,5326
Fog,40.153,5467


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- Number of rows with a non-null ID for each vehicle type.
SELECT
  Type_of_Vehicle,
  COUNT(ID) AS count
FROM operationalAnalytics
GROUP BY
  Type_of_Vehicle

Type_of_Vehicle,count
motorcycle,18462
scooter,10136
electric_scooter,2490


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- age of delivery person vs time to fulfill order
-- Fulfilment time = `Time_taken (min)` + minutes between order and pickup.
-- The two parts are added per row before averaging, so both are averaged over
-- the same rows. AVG(a) + AVG(b) skips NULLs independently for each part and
-- mixes different row sets whenever only one of the two values is missing.
-- num_orders still counts every row with a non-null ID, including rows that
-- do not contribute to the average.
SELECT Delivery_person_Age,
       ROUND(AVG(`Time_taken (min)` + TIMESTAMPDIFF(MINUTE, Time_Orderd, Time_Order_picked)), 3) AS avg_time_taken,
       COUNT(ID) AS num_orders
FROM operationalAnalytics
GROUP BY Delivery_person_Age
ORDER BY Delivery_person_Age ASC

Delivery_person_Age,avg_time_taken,num_orders
20,34.389,1473
21,34.307,1470
22,34.26,1509
23,34.457,1436
24,34.681,1474
25,34.235,1457
26,34.084,1473
27,34.222,1435
28,34.408,1492
29,34.428,1471


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- distribution of orders placed throughout the day
-- Fulfilment time = `Time_taken (min)` + minutes between order and pickup.
-- The two parts are added per row before averaging, so both are averaged over
-- the same rows. AVG(a) + AVG(b) skips NULLs independently for each part and
-- mixes different row sets whenever only one of the two values is missing.
-- order_count still counts every row with a non-null ID, including rows that
-- do not contribute to the average.
SELECT HOUR(Time_Orderd) AS Order_Hour,
       COUNT(ID) AS order_count,
       ROUND(AVG(`Time_taken (min)` + TIMESTAMPDIFF(MINUTE, Time_Orderd, Time_Order_picked)), 3) AS avg_time_taken
FROM operationalAnalytics
GROUP BY Order_Hour
ORDER BY Order_Hour

Order_Hour,order_count,avg_time_taken
8,1217,30.503
9,1255,30.134
10,1437,30.141
11,1303,38.104
12,678,37.794
13,635,38.257
14,537,38.147
15,588,34.376
16,544,33.831
17,3153,38.528


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- 100 most frequent item names among rows that have positive dining and
-- delivery ratings and votes.
-- DISTINCT was dropped: GROUP BY Item_Name already yields one row per item.
-- Item_Name is a secondary sort key so ties at the LIMIT boundary are deterministic.
SELECT Item_Name, COUNT(*) AS count
FROM metropolitanData
WHERE Dining_Votes>0 AND Dining_Rating>0 AND Delivery_Rating>0 AND Delivery_Votes>0
GROUP BY Item_Name
ORDER BY count DESC, Item_Name ASC
LIMIT 100

Item_Name,count
Veg Fried Rice,50
Chicken Fried Rice,42
Margherita Pizza,38
Paneer Butter Masala,31
Cold Coffee,30
French Fries,29
Jeera Rice,28
Egg Fried Rice,28
Veg Biryani,25
Strawberry Shake,24


Databricks visualization. Run in Databricks to view.

In [None]:
%sql

-- Row count per restaurant coordinate pair, keeping only rows whose
-- restaurant latitude is greater than zero.
SELECT
  Restaurant_latitude,
  Restaurant_longitude,
  COUNT(*) AS count
FROM operationalAnalytics
WHERE Restaurant_latitude > 0
GROUP BY
  Restaurant_latitude,
  Restaurant_longitude

Restaurant_latitude,Restaurant_longitude,count
12.284747,76.625861,109
22.307898,73.167788,98
30.361281,78.068022,30
27.161661,78.011544,31
30.885814,75.786976,23
26.47,80.35,24
30.890184,75.829615,25
26.90294,75.793007,119
11.001852,76.976268,107
25.450317,81.831681,20


Databricks visualization. Run in Databricks to view.