# Zomato Restaurant Data – Exploratory Data Analysis

This notebook is a ready-to-run template for a Data Cleaning & EDA project. Replace `data/zomato.csv` with your dataset and run the cells.



In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
import os

#sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)

# Paths
DATA_PATH = "zomato.csv"

In [10]:
# ------------------------------
# Load dataset
# ------------------------------
if not os.path.exists(DATA_PATH):
    print(f" Dataset not found at {DATA_PATH}. Please place your dataset there (or update DATA_PATH).")
else:
    df = pd.read_csv(DATA_PATH)
    print(" Dataset loaded. Shape:", df.shape)
    display(df.head())


 Dataset loaded. Shape: (51717, 17)


Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [11]:
try:
    display(df.info())
    display(df.describe(include='all').T)
    print("Missing values per column:\n", df.isnull().sum())
except NameError:
    print("Run the dataset load cell first.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

None

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
url,51717.0,51717.0,https://www.zomato.com/bangalore/the-nest-the-...,1.0,,,,,,,
address,51717.0,11495.0,Delivery Only,128.0,,,,,,,
name,51717.0,8792.0,Cafe Coffee Day,96.0,,,,,,,
online_order,51717.0,2.0,Yes,30444.0,,,,,,,
book_table,51717.0,2.0,No,45268.0,,,,,,,
rate,43942.0,64.0,NEW,2208.0,,,,,,,
votes,51717.0,,,,283.697527,803.838853,0.0,7.0,41.0,198.0,16832.0
phone,50509.0,14926.0,080 43334321,216.0,,,,,,,
location,51696.0,93.0,BTM,5124.0,,,,,,,
rest_type,51490.0,93.0,Quick Bites,19132.0,,,,,,,


Missing values per column:
 url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64


In [12]:
# ------------------------------
# Data cleaning
# ------------------------------
# Note: Update column names below if your dataset uses different names.

try:
    # Remove duplicates
    df.drop_duplicates(inplace=True)

    # Example columns - change if necessary
    if 'cuisine' in df.columns:
        df['cuisine'] = df['cuisine'].fillna('Unknown')

    if 'rating' in df.columns:
        df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
        df = df[df['rating'] <= 5]  # remove impossible ratings (>5)

    if 'city' in df.columns:
        df['city'] = df['city'].astype(str).str.title().str.strip()

    print("Cleaning applied. New shape:", df.shape)
    display(df.head())
except NameError:
    print("Run the dataset load cell first.")


Cleaning applied. New shape: (51717, 17)


Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [13]:
# ------------------------------
# Univariate analysis
# ------------------------------
try:
    if 'rating' in df.columns:
        plt.figure(figsize=(8,5))
        sns.histplot(df['rating'].dropna(), bins=20, kde=True)
        plt.title("Distribution of Ratings")
        plt.show()
    if 'cuisine' in df.columns:
        plt.figure(figsize=(10,5))
        df['cuisine'].value_counts().head(10).plot(kind='bar')
        plt.title("Top 10 Cuisines")
        plt.show()
except NameError:
    print("Run the dataset load and cleaning cells first.")


In [14]:
# ------------------------------
# Bivariate analysis
# ------------------------------
try:
    if 'city' in df.columns and 'rating' in df.columns:
        plt.figure(figsize=(12,6))
        sns.boxplot(x='city', y='rating', data=df)
        plt.xticks(rotation=45)
        plt.title("Ratings by City")
        plt.show()

    if 'votes' in df.columns and 'rating' in df.columns:
        plt.figure(figsize=(8,5))
        sns.scatterplot(x='votes', y='rating', data=df)
        plt.title("Votes vs Rating")
        plt.show()
except NameError:
    print("Run the dataset load and cleaning cells first.")


In [15]:
# ------------------------------
# Multivariate analysis
# ------------------------------
try:
    numeric_df = df.select_dtypes(include=[np.number])
    if not numeric_df.empty:
        plt.figure(figsize=(8,6))
        sns.heatmap(numeric_df.corr(), annot=True)
        plt.title("Correlation Heatmap")
        plt.show()
    else:
        print("No numeric columns to compute correlations.")
except NameError:
    print("Run the dataset load and cleaning cells first.")


Run the dataset load and cleaning cells first.


<Figure size 800x600 with 0 Axes>

In [16]:
# ------------------------------
# Save cleaned data
# ------------------------------
try:
    os.makedirs("data", exist_ok=True)  # Ensure 'data' folder exists
    cleaned_path = os.path.join("data", "zomato_cleaned.csv")
    df.to_csv(cleaned_path, index=False)
    print(f" Cleaned data saved to {cleaned_path}")
except NameError:
    print("Run the dataset load and cleaning cells first.")
