In [1]:
!pip install folium

Collecting folium
  Downloading folium-0.19.5-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting branca>=0.6.0 (from folium)
  Downloading branca-0.8.1-py3-none-any.whl.metadata (1.5 kB)
Downloading folium-0.19.5-py2.py3-none-any.whl (110 kB)
Downloading branca-0.8.1-py3-none-any.whl (26 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.8.1 folium-0.19.5


In [2]:
!pip install IPython



In [5]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1


# 1. Load Python Modules

In [7]:
import html
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import plotly.express as px
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from IPython.display import display, HTML, IFrame

# 2. Read the Zomato Dataset from CSV file - Using Pandas

In [122]:
# Load the Zomato restaurant listings from the CSV next to this notebook.
df1 = pd.read_csv(filepath_or_buffer="zomato_data.csv")
# Bare last expression: renders the frame as the cell output.
df1

Unnamed: 0,online_order,book_table,rate,votes,rest_type,dish_liked,cuisines,approx_costfor_two_people,listed_intype,listed_incity
0,Yes,Yes,4.1/5,775,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,Buffet,Banashankari
1,Yes,No,4.1/5,787,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,Buffet,Banashankari
2,Yes,No,3.8/5,918,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,Buffet,Banashankari
3,No,No,3.7/5,88,Quick Bites,Masala Dosa,"South Indian, North Indian",300,Buffet,Banashankari
4,No,No,3.8/5,166,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,Buffet,Banashankari
...,...,...,...,...,...,...,...,...,...,...
51712,No,No,3.6 /5,27,Bar,,Continental,1500,Pubs and bars,Whitefield
51713,No,No,,0,Bar,,Finger Food,600,Pubs and bars,Whitefield
51714,No,No,,0,Bar,,Finger Food,2000,Pubs and bars,Whitefield
51715,No,Yes,4.3 /5,236,Bar,"Cocktails, Pizza, Buttermilk",Finger Food,2500,Pubs and bars,Whitefield


# 3. Basic Inspection of the Dataset

In [123]:
# Structural overview of the Zomato frame: first/last rows, shape, column
# names, dtypes and per-column missing-value counts.
print("top 5 rows using head")
print(df1.head())
print()
print("bottom 5 rows using tail")
print(df1.tail())
print()
print("numbers of samples and columns")
print(df1.shape)
print()
print("Column Names")
print(df1.columns)
print()
print("DataFrame Info")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df1.info()
print()
print("Check the missing value in each column")
print(df1.isnull().sum())
print()

top 5 rows using head
  online_order book_table   rate  votes            rest_type  \
0          Yes        Yes  4.1/5    775        Casual Dining   
1          Yes         No  4.1/5    787        Casual Dining   
2          Yes         No  3.8/5    918  Cafe, Casual Dining   
3           No         No  3.7/5     88          Quick Bites   
4           No         No  3.8/5    166        Casual Dining   

                                          dish_liked  \
0  Pasta, Lunch Buffet, Masala Papad, Paneer Laja...   
1  Momos, Lunch Buffet, Chocolate Nirvana, Thai G...   
2  Churros, Cannelloni, Minestrone Soup, Hot Choc...   
3                                        Masala Dosa   
4                                Panipuri, Gol Gappe   

                         cuisines approx_costfor_two_people listed_intype  \
0  North Indian, Mughlai, Chinese                       800        Buffet   
1     Chinese, North Indian, Thai                       800        Buffet   
2          Cafe, Mexican,

# 4. Data Cleaning and Preprocessing

## 4.1. Fill the missing values using the Median Rating

In [124]:
# Treat a bare "-" rating as missing so it is filled together with the other gaps.
df1['rate'] = df1['rate'].replace({"-": np.nan})

In [125]:
# Remove the "/5" scale suffix (literal match, not a regular expression).
df1['rate'] = df1['rate'].str.replace(pat="/5", repl="", regex=False)

In [126]:
# Parse the remaining text as numbers; anything unparsable becomes NaN
# (errors="coerce") instead of raising.
rate_numeric = pd.to_numeric(df1['rate'], errors="coerce")
df1['rate'] = rate_numeric

In [127]:
# Impute missing ratings with the median of the observed ratings.
rate_column = df1['rate']
median_rating = rate_column.median()
df1['rate'] = rate_column.fillna(value=median_rating)

In [128]:
# Re-check dtypes, missing-value counts and summary statistics after cleaning 'rate'.
df1.info()
# A cell only auto-renders its LAST expression, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(df1.isnull().sum())
df1.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   online_order               51717 non-null  object 
 1   book_table                 51717 non-null  object 
 2   rate                       51717 non-null  float64
 3   votes                      51717 non-null  int64  
 4   rest_type                  51490 non-null  object 
 5   dish_liked                 23639 non-null  object 
 6   cuisines                   51672 non-null  object 
 7   approx_costfor_two_people  51371 non-null  object 
 8   listed_intype              51717 non-null  object 
 9   listed_incity              51717 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.9+ MB


Unnamed: 0,rate,votes
count,51717.0,51717.0
mean,3.700362,283.697527
std,0.395391,803.838853
min,1.8,0.0
25%,3.5,7.0
50%,3.7,41.0
75%,3.9,198.0
max,4.9,16832.0


## 4.2. Fill the missing values using the Median Cost

In [129]:
# Remove commas from the cost text (literal match) before numeric parsing.
df1['approx_costfor_two_people'] = df1['approx_costfor_two_people'].str.replace(
    pat=',', repl='', regex=False
)

In [130]:
# Parse the cost text as numbers; unparsable entries become NaN (errors='coerce').
cost_numeric = pd.to_numeric(df1['approx_costfor_two_people'], errors='coerce')
df1['approx_costfor_two_people'] = cost_numeric

In [131]:
# Median of the observed costs (NaN entries are skipped); used as the fill value.
median_cost = df1['approx_costfor_two_people'].median(skipna=True)

In [132]:
# Impute missing costs with the median computed in the previous cell.
df1['approx_costfor_two_people'] = df1['approx_costfor_two_people'].fillna(
    value=median_cost
)

In [133]:
# Re-check dtypes, missing-value counts and summary statistics after cleaning the cost column.
df1.info()
# A cell only auto-renders its LAST expression, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(df1.isnull().sum())
df1.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   online_order               51717 non-null  object 
 1   book_table                 51717 non-null  object 
 2   rate                       51717 non-null  float64
 3   votes                      51717 non-null  int64  
 4   rest_type                  51490 non-null  object 
 5   dish_liked                 23639 non-null  object 
 6   cuisines                   51672 non-null  object 
 7   approx_costfor_two_people  51717 non-null  float64
 8   listed_intype              51717 non-null  object 
 9   listed_incity              51717 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 3.9+ MB


Unnamed: 0,rate,votes,approx_costfor_two_people
count,51717.0,51717.0,51717.0
mean,3.700362,283.697527,554.391689
std,0.395391,803.838853,437.563723
min,1.8,0.0,40.0
25%,3.5,7.0,300.0
50%,3.7,41.0,400.0
75%,3.9,198.0,650.0
max,4.9,16832.0,6000.0


## 4.3. Replacing NaN Values

In [134]:
# Label missing dish lists explicitly.
# Reads from df1: the original read from `df`, a name this notebook never
# defines, so the cell failed (or used stale data) on a fresh kernel.
df1['dish_liked'] = df1['dish_liked'].fillna('Not Available')

In [135]:
# Label missing cuisines explicitly.
# Reads from df1: the original read from `df`, a name this notebook never
# defines, so the cell failed (or used stale data) on a fresh kernel.
df1['cuisines'] = df1['cuisines'].fillna('Other')

In [136]:
# Label missing restaurant types explicitly.
# Reads from df1: the original read from `df`, a name this notebook never
# defines, so the cell failed (or used stale data) on a fresh kernel.
df1['rest_type'] = df1['rest_type'].fillna('Unknown')

In [137]:
# Distinct values of 'dish_liked' after filling the gaps.
pd.unique(df1['dish_liked'])

array(['Pasta, Lunch Buffet, Masala Papad, Paneer Lajawab, Tomato Shorba, Dum Biryani, Sweet Corn Soup',
       'Momos, Lunch Buffet, Chocolate Nirvana, Thai Green Curry, Paneer Tikka, Dum Biryani, Chicken Biryani',
       'Churros, Cannelloni, Minestrone Soup, Hot Chocolate, Pink Sauce Pasta, Salsa, Veg Supreme Pizza',
       ...,
       'Noodles, Chicken Noodle, Momos, American Chopsuey, Salad, Manchow Soup, Manchurian',
       'Chicken Quesadilla, Naan, Breakfast Buffet, Cheesecake, Cocktails, Lunch Buffet, Biryani',
       'Biryani, Andhra Meal'], dtype=object)

In [138]:
# Distinct values of 'cuisines' after filling the gaps.
pd.unique(df1['cuisines'])

array(['North Indian, Mughlai, Chinese', 'Chinese, North Indian, Thai',
       'Cafe, Mexican, Italian', ...,
       'North Indian, Street Food, Biryani', 'Chinese, Mughlai',
       'North Indian, Chinese, Arabian, Momos'], dtype=object)

In [139]:
# Distinct values of 'rest_type' after filling the gaps.
pd.unique(df1['rest_type'])

array(['Casual Dining', 'Cafe, Casual Dining', 'Quick Bites',
       'Casual Dining, Cafe', 'Cafe', 'Quick Bites, Cafe',
       'Cafe, Quick Bites', 'Delivery', 'Mess', 'Dessert Parlor',
       'Bakery, Dessert Parlor', 'Pub', 'Bakery', 'Takeaway, Delivery',
       'Fine Dining', 'Beverage Shop', 'Sweet Shop', 'Bar',
       'Beverage Shop, Quick Bites', 'Confectionery',
       'Quick Bites, Beverage Shop', 'Dessert Parlor, Sweet Shop',
       'Bakery, Quick Bites', 'Sweet Shop, Quick Bites', 'Kiosk',
       'Food Truck', 'Quick Bites, Dessert Parlor',
       'Beverage Shop, Dessert Parlor', 'Takeaway', 'Pub, Casual Dining',
       'Casual Dining, Bar', 'Dessert Parlor, Beverage Shop',
       'Quick Bites, Bakery', 'Dessert Parlor, Quick Bites',
       'Microbrewery, Casual Dining', 'Lounge', 'Bar, Casual Dining',
       'Food Court', 'Cafe, Bakery', 'Unknown', 'Dhaba',
       'Quick Bites, Sweet Shop', 'Microbrewery',
       'Food Court, Quick Bites', 'Pub, Bar', 'Casual Dining, Pub',


In [140]:
# Re-check dtypes, missing-value counts and summary statistics after filling the text columns.
df1.info()
# A cell only auto-renders its LAST expression, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(df1.isnull().sum())
df1.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   online_order               51717 non-null  object 
 1   book_table                 51717 non-null  object 
 2   rate                       51717 non-null  float64
 3   votes                      51717 non-null  int64  
 4   rest_type                  51717 non-null  object 
 5   dish_liked                 51717 non-null  object 
 6   cuisines                   51717 non-null  object 
 7   approx_costfor_two_people  51717 non-null  float64
 8   listed_intype              51717 non-null  object 
 9   listed_incity              51717 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 3.9+ MB


Unnamed: 0,rate,votes,approx_costfor_two_people
count,51717.0,51717.0,51717.0
mean,3.700362,283.697527,554.391689
std,0.395391,803.838853,437.563723
min,1.8,0.0,40.0
25%,3.5,7.0,300.0
50%,3.7,41.0,400.0
75%,3.9,198.0,650.0
max,4.9,16832.0,6000.0


## 4.4. Fill missing values in Votes with Median

In [142]:
# Impute missing vote counts with the median of the observed counts.
votes_column = df1['votes']
median_votes = votes_column.median()
df1['votes'] = votes_column.fillna(value=median_votes)

## 4.5. Binary Encoding

In [143]:
# List the distinct labels of the two yes/no columns before encoding them.
for flag_column in ('online_order', 'book_table'):
    print(f"Unique values in '{flag_column}':", df1[flag_column].unique())


Unique values in 'online_order': ['Yes' 'No']
Unique values in 'book_table': ['Yes' 'No']


In [144]:
# Encode the two yes/no columns as 1/0.
yes_no_codes = {'Yes': 1, 'No': 0}
for flag_column in ('online_order', 'book_table'):
    # Skip columns that are already numeric: mapping the encoded 1/0 values
    # through the Yes/No dict a second time would turn every entry into NaN,
    # so without this guard the cell could not be re-run safely.
    if not pd.api.types.is_numeric_dtype(df1[flag_column]):
        df1[flag_column] = df1[flag_column].map(yes_no_codes)

print(df1[['online_order', 'book_table']].head())

   online_order  book_table
0             1           1
1             1           0
2             1           0
3             0           0
4             0           0


In [145]:
# Re-check dtypes, missing-value counts and summary statistics after the binary encoding.
df1.info()
# A cell only auto-renders its LAST expression, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(df1.isnull().sum())
df1.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   online_order               51717 non-null  int64  
 1   book_table                 51717 non-null  int64  
 2   rate                       51717 non-null  float64
 3   votes                      51717 non-null  int64  
 4   rest_type                  51717 non-null  object 
 5   dish_liked                 51717 non-null  object 
 6   cuisines                   51717 non-null  object 
 7   approx_costfor_two_people  51717 non-null  float64
 8   listed_intype              51717 non-null  object 
 9   listed_incity              51717 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 3.9+ MB


Unnamed: 0,online_order,book_table,rate,votes,approx_costfor_two_people
count,51717.0,51717.0,51717.0,51717.0,51717.0
mean,0.588665,0.124698,3.700362,283.697527,554.391689
std,0.49208,0.330379,0.395391,803.838853,437.563723
min,0.0,0.0,1.8,0.0,40.0
25%,0.0,0.0,3.5,7.0,300.0
50%,1.0,0.0,3.7,41.0,400.0
75%,1.0,0.0,3.9,198.0,650.0
max,1.0,1.0,4.9,16832.0,6000.0


## 4.6. Data Type Conversion

In [147]:
# Clean and convert 'rate' to float
df1['rate'] = df1['rate'].astype(str).str.replace('/5', '').str.strip()
df1['rate'] = df1['rate'].replace({'NEW': None, '-': None})
df1['rate'] = df1['rate'].astype(float)

# Clean and convert 'votes' to integer (nullable Int64)
df1['votes'] = df1['votes'].astype(str).str.replace(',', '')
df1['votes'] = pd.to_numeric(df1['votes'], errors='coerce').astype('Int64')

# Find and clean the cost column (with any formatting issues)
cost_col = [col for col in df1.columns if 'cost' in col.lower()][0]
df1[cost_col] = df1[cost_col].astype(str).str.replace(',', '')
df1[cost_col] = pd.to_numeric(df1[cost_col], errors='coerce').astype('Int64')

# Check types and preview
print(df1.dtypes[['rate', 'votes', cost_col]])
print(df1[['rate', 'votes', cost_col]].head())


rate                         float64
votes                          Int64
approx_costfor_two_people      Int64
dtype: object
   rate  votes  approx_costfor_two_people
0   4.1    775                        800
1   4.1    787                        800
2   3.8    918                        800
3   3.7     88                        300
4   3.8    166                        600


In [148]:
# Re-check dtypes, missing-value counts and summary statistics after the dtype conversion.
df1.info()
# A cell only auto-renders its LAST expression, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(df1.isnull().sum())
df1.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   online_order               51717 non-null  int64  
 1   book_table                 51717 non-null  int64  
 2   rate                       51717 non-null  float64
 3   votes                      51717 non-null  Int64  
 4   rest_type                  51717 non-null  object 
 5   dish_liked                 51717 non-null  object 
 6   cuisines                   51717 non-null  object 
 7   approx_costfor_two_people  51717 non-null  Int64  
 8   listed_intype              51717 non-null  object 
 9   listed_incity              51717 non-null  object 
dtypes: Int64(2), float64(1), int64(2), object(5)
memory usage: 4.0+ MB


Unnamed: 0,online_order,book_table,rate,votes,approx_costfor_two_people
count,51717.0,51717.0,51717.0,51717.0,51717.0
mean,0.588665,0.124698,3.700362,283.697527,554.391689
std,0.49208,0.330379,0.395391,803.838853,437.563723
min,0.0,0.0,1.8,0.0,40.0
25%,0.0,0.0,3.5,7.0,300.0
50%,1.0,0.0,3.7,41.0,400.0
75%,1.0,0.0,3.9,198.0,650.0
max,1.0,1.0,4.9,16832.0,6000.0


# 5. Read the Geographical Coordinates Dataset from CSV file - Using Pandas

In [98]:
# Load the lookup table holding one latitude/longitude pair per 'listed_incity' area.
df2 = pd.read_csv(filepath_or_buffer="Geographical Coordinates.csv")
# Bare last expression: renders the frame as the cell output.
df2

Unnamed: 0,listed_incity,Latitude,Longitude
0,Banashankari,12.939333,77.553982
1,Bannerghatta Road,12.95266,77.605048
2,Basavanagudi,12.941726,77.575502
3,Bellandur,12.925352,77.675941
4,Brigade Road,12.967358,77.606435
5,Brookefield,12.963814,77.722437
6,BTM,12.91636,77.604733
7,Church Street,12.974914,77.605247
8,Electronic City,12.84876,77.648253
9,Frazer Town,12.998683,77.615525


# 6. Basic Inspection of the Data

In [85]:
# Structural overview of the coordinates frame: first/last rows, shape,
# column names, dtypes and per-column missing-value counts.
print("top 5 rows using head")
print(df2.head())
print()
print("bottom 5 rows using tail")
print(df2.tail())
print()
print("numbers of samples and columns")
print(df2.shape)
print()
print("Column Names")
print(df2.columns)
print()
print("DataFrame Info")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df2.info()
print()
print("Check the missing value in each column")
print(df2.isnull().sum())
print()

top 5 rows using head
       listed_incity   Latitude  Longitude
0       Banashankari  12.939333  77.553982
1  Bannerghatta Road  12.952660  77.605048
2       Basavanagudi  12.941726  77.575502
3          Bellandur  12.925352  77.675941
4       Brigade Road  12.967358  77.606435

bottom 5 rows using tail
       listed_incity   Latitude  Longitude
21      Malleshwaram  13.002735  77.570325
22      Marathahalli  12.955257  77.698416
23           MG Road  12.975526  77.606790
24      New BEL Road  13.039186  77.564284
25  Old Airport Road  12.960632  77.642500

numbers of samples and columns
(26, 3)

Column Names
Index(['listed_incity', 'Latitude', 'Longitude'], dtype='object')

DataFrame Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   listed_incity  26 non-null     object 
 1   Latitude       26 non-null     float64
 2   Longitude      26

In [87]:
# Preview the first five rows of the coordinates lookup table.
df2.head(n=5)

Unnamed: 0,listed_incity,Latitude,Longitude
0,Banashankari,12.939333,77.553982
1,Bannerghatta Road,12.95266,77.605048
2,Basavanagudi,12.941726,77.575502
3,Bellandur,12.925352,77.675941
4,Brigade Road,12.967358,77.606435


# 7. Dataset Merging for Mapping

In [149]:
# Standardize merge key column in both datasets
df1['listed_incity'] = df1['listed_incity'].astype(str).str.strip().str.title()
df2['listed_incity'] = df2['listed_incity'].astype(str).str.strip().str.title()

# Merge on 'listed_incity'
merged_df = pd.merge(df1, df2[['listed_incity', 'Latitude', 'Longitude']], on='listed_incity', how='left')

# Confirm merge result
print(merged_df[['listed_incity', 'Latitude', 'Longitude']].head())


  listed_incity   Latitude  Longitude
0  Banashankari  12.939333  77.553982
1  Banashankari  12.939333  77.553982
2  Banashankari  12.939333  77.553982
3  Banashankari  12.939333  77.553982
4  Banashankari  12.939333  77.553982


# 8. Restaurant Density Map

In [152]:
# Step 1: Drop rows with missing coordinates
restaurant_df = merged_df.dropna(subset=['Latitude', 'Longitude'])

# Step 2: Create the base map centered on Bangalore
density_map = folium.Map(location=[12.9716, 77.5946], zoom_start=12)

# Step 3: Add marker cluster to the map
marker_cluster = MarkerCluster().add_to(density_map)

# Step 4: Add markers for each restaurant.
# itertuples over just the needed columns avoids building a full Series per
# row (as iterrows does). Text fields are HTML-escaped because the popup
# string is rendered as HTML and the values come straight from the CSV.
popup_columns = ['Latitude', 'Longitude', 'listed_incity', 'cuisines', 'rate']
for latitude, longitude, area, cuisines, rating in restaurant_df[popup_columns].itertuples(
    index=False, name=None
):
    folium.Marker(
        location=[latitude, longitude],
        popup=f"""
        <b>Area:</b> {html.escape(str(area))}<br>
        <b>Cuisines:</b> {html.escape(str(cuisines))}<br>
        <b>Rating:</b> {rating}
        """,
        icon=folium.Icon(color='blue', icon='cutlery', prefix='fa')
    ).add_to(marker_cluster)

# Step 5: Save the map as HTML
density_map.save("restaurant_density_map.html")

# Step 6: Display the map in the notebook
IFrame("restaurant_density_map.html", width=700, height=500)

# 9. Cuisine-Specific Map (Italian Restaurants)

In [99]:
# Step 1 & 2: Filter data for Italian cuisine
italian_df = merged_df[merged_df['cuisines'].str.contains('Italian', na=False)]

# Drop rows with missing coordinates
italian_df = italian_df.dropna(subset=['Latitude', 'Longitude'])

# Step 3: Initialize map centered on Bangalore
italian_map = folium.Map(location=[12.9716, 77.5946], zoom_start=12)

# Step 4: Add markers using MarkerCluster
marker_cluster = MarkerCluster().add_to(italian_map)

for i, row in italian_df.iterrows():
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=f"""
        <b>City:</b> {row['listed_incity']}<br>
        <b>Cuisines:</b> {row['cuisines']}
        """,
        icon=folium.Icon(color='purple', icon='cutlery', prefix='fa')
    ).add_to(marker_cluster)

# Step 6: Save the map
italian_map.save("restaurant_density.html")

# Step 7: Display in notebook
IFrame("restaurant_density.html", width=700, height=500)
