# 1. Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "svg"

# 2. Importing the dataset

In [2]:
raw_df = pd.read_csv('ChennaiZomato.csv',delimiter='|')

In [3]:
raw_df

Unnamed: 0,"Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating,Delivery Rating Count,Features"
0,"Yaa Mohaideen Briyani,Pallavaram,['Biryani'],""..."
1,"Sukkubhai Biriyani,Alandur,['Biryani',""['Beef ..."
2,"Sukkubhai Biriyani,Alandur, ' North Indian',""[..."
3,"Sukkubhai Biriyani,Alandur, ' Mughlai',""['Beef..."
4,"Sukkubhai Biriyani,Alandur, ' Desserts',""['Bee..."
...,...
12306,"Engineers Kitchen,Ambattur,['North Indian',Inv..."
12307,"Engineers Kitchen,Ambattur, ' Chinese',Invalid..."
12308,"Engineers Kitchen,Ambattur, ' Andhra'],Invalid..."
12309,"That Juice Shop,Karapakkam,['Juices',Invalid,2..."


In [4]:
raw_df.head()

Unnamed: 0,"Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating,Delivery Rating Count,Features"
0,"Yaa Mohaideen Briyani,Pallavaram,['Biryani'],""..."
1,"Sukkubhai Biriyani,Alandur,['Biryani',""['Beef ..."
2,"Sukkubhai Biriyani,Alandur, ' North Indian',""[..."
3,"Sukkubhai Biriyani,Alandur, ' Mughlai',""['Beef..."
4,"Sukkubhai Biriyani,Alandur, ' Desserts',""['Bee..."


In [5]:
print(raw_df.columns)

Index(['Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating,Delivery Rating Count,Features'], dtype='object')


# 3. Getting Basic Information about the Dataset

In [6]:
raw_df.shape

(12311, 1)

In [7]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12311 entries, 0 to 12310
Data columns (total 1 columns):
 #   Column                                                                                                                                       Non-Null Count  Dtype 
---  ------                                                                                                                                       --------------  ----- 
 0   Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating,Delivery Rating Count,Features  12311 non-null  object
dtypes: object(1)
memory usage: 96.3+ KB


In [8]:
raw_df.describe()

Unnamed: 0,"Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating,Delivery Rating Count,Features"
count,12311
unique,12311
top,"That Juice Shop,Karapakkam, ' Beverages'],Inva..."
freq,1


In [9]:
raw_df.isnull().sum()

Name of Restaurant,Location,Cuisine,Top Dishes,Price for 2,Dining Rating,Dining Rating Count,Delivery Rating,Delivery Rating Count,Features    0
dtype: int64

# 4. cleaning the dataset

In [10]:
 # It seems the 'Cuisine' column contains lists (e.g., ['Biryani', 'North Indian']) or strings. We'll need to handle it properly.
# Ensure 'Cuisine' is a string and handle the string to list conversion
# If there are extra spaces around column names, strip them
raw_df.columns = raw_df.columns.str.strip()
# 3. Check for the presence of the Cuisine column.
# Once spaces are stripped, check again if the Cuisine column is present.
 if 'Cuisine' in raw_df.columns:
       print("Cuisine column exists")
else:
       print("Cuisine column not found")
# Handle the Cuisine column:
raw_df['Cuisine'] = raw_df['Cuisine'].apply(lambda x: x.strip("[]").replace("'", "").split(',') if isinstance(x, str) else x)
# Now let's handle the case where multiple cuisines are listed per restaurant
exploded_df = raw_df.explode('Cuisine')
# Ensure no 'Invalid' values remain
exploded_df.replace('Invalid', pd.NA, inplace=True)
# Finally, check the DataFrame
print(exploded_df.head())

                   



Cuisine column not found


KeyError: 'Cuisine'

In [None]:
# Convert numeric columns to appropriate types
raw_df['Price for 2'] = pd.to_numeric(raw_df['Price for 2'], errors='coerce')
raw_df['Dining Rating'] = pd.to_numeric(raw_df['Dining Rating'], errors='coerce')
raw_df['Delivery Rating'] = pd.to_numeric(raw_df['Delivery Rating'], errors='coerce')


In [None]:
raw_df.info()

In [None]:
# Handle missing values in numeric columns
raw_df.fillna({'Price for 2': raw_df['Price for 2'].mean(),
                   'Dining Rating': raw_df['Dining Rating'].mean(),
                   'Delivery Rating': raw_df['Delivery Rating'].mean()}, inplace=True)

In [None]:
# Remove duplicate rows
raw_df.drop_duplicates(inplace=True)

In [None]:
# Convert categorical columns to numeric codes if needed
raw_df['Location'] = raw_df['Location'].astype('category').cat.codes
raw_df['Top Dishes'] = raw_df['Top Dishes'].astype('category').cat.codes
raw_df['Features'] = raw_df['Features'].astype('category').cat.codes


# 4. Copying the cleaned data into a new DataFrame

In [None]:
zomato_df = raw_df.copy()

In [None]:
zomato_df.head()

 # 5. Performing Exploratory Data Analysis
Q1) How many restaurants are in Mumbai for each type of cuisine?

In [None]:
fig = px.histogram(zomato_df, x='CUSINE TYPE', color='CUSINE TYPE',
title= 'No. of Restaurants by Cuisine Type',
labels={'CUSINE TYPE':'Cuisine Type'})
fig.show()

In [None]:
raw_df.columns

In [None]:
# Q1) How many restaurants are in Chennai for each type of cuisine?

In [None]:
fig = px.histogram(raw_df, x='Cuisine', color='Cuisine',
title= 'No. of Restaurants by Cuisine Type',
labels={'Cusine Type ':'Cuisine'})
fig.show()

In [None]:
# Check if the 'Cuisine' column contains lists or strings of multiple cuisines per restaurant
# If Cuisine is a list, we need to explode it to count each cuisine separately
# Assuming 'Cuisine' column contains strings like "['Biryani', 'North Indian']" or just 'Biryani'
# Step 1: Convert 'Cuisine' column to actual lists if they are stored as strings
raw_df['Cuisine'] = raw_df['Cuisine'].apply(lambda x: x.strip("[]").replace("'", "").split(',') if isinstance(x, str) else x)
# Step 2: Explode the 'Cuisine' column to handle multiple cuisines per restaurant
exploded_df = raw_df.explode('Cuisine')
# Step 3: Create the histogram to count the number of restaurants by cuisine type
fig = px.histogram(exploded_df, x='Cuisine', color='Cuisine', 
                   title='No. of Restaurants by Cuisine Type',
                   labels={'Cuisine': 'Cuisine Type'})  # Label the x-axis for clarity
# Show the plot
fig.show()

In [None]:
rating_type_df = raw_df['Rating Count'].value_counts().reset_index()