<a href="https://colab.research.google.com/github/TVSSSoureesh/Data-Analysis-on-Swiggy-Restaurant-Dataset/blob/main/Swiggy_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Swiggy dataset through kagglehub; the value returned by the
# download call is kept for later cells.
swiggy_dataset_handle = 'rrkcoder/swiggy-restaurants-dataset'
rrkcoder_swiggy_restaurants_dataset_path = kagglehub.dataset_download(swiggy_dataset_handle)

print('Data source import complete.')


<div style="padding: 20px;border-radius: 16px;background-color: #FF3CAC;
background-image: linear-gradient(225deg, #FF3CAC 0%, #784BA0 50%, #2B86C5 100%);
color: white;text-align:center;font-family: Inter,sans-serif;">
        <h4 style="font-size: 24px;">Swiggy Restaurants Dataset</h4>
    </div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:

# Load the restaurant table.
# On Kaggle the dataset is mounted under /kaggle/input; everywhere else
# (e.g. Colab) that directory does not exist, so fall back to the copy that
# the kagglehub cell at the top of the notebook downloaded.
try:
    df = pd.read_csv('/kaggle/input/swiggy-restaurants-dataset/swiggy_file.csv')
except FileNotFoundError:
    df = pd.read_csv(f'{rrkcoder_swiggy_restaurants_dataset_path}/swiggy_file.csv')

In [None]:
# Preview only the first rows of the freshly loaded table instead of rendering the whole frame.
df.head()

<div style="padding: 20px;border-radius: 16px;background-color: #FF3CAC;
background-image: linear-gradient(225deg, #FF3CAC 0%, #784BA0 50%, #2B86C5 100%);
color: white;text-align:center;font-family: Inter,sans-serif;">
        <h4 style="font-size: 24px;">Data Cleaning</h4>
    </div>

In [None]:
# Reduce price strings such as '₹50 for one' to the integer 50; values that
# are missing or contain no digits become 0.
# The pattern is a raw string so that "\d" is not treated as an (invalid)
# string escape, and expand=False makes str.extract return a Series instead
# of a one-column DataFrame.
df['Average Price'] = (
    df['Average Price']
    .str.extract(r'(\d+)', expand=False)
    .fillna(0)
    .astype(int)
)


In [None]:
# Spot-check the cleaned 'Average Price' column on the first rows only.
df.head()

In [None]:
# A rating that is exactly '-' is swapped for the string '0';
# every other rating value is left untouched.
df['Rating'] = df['Rating'].replace({'-': '0'})


In [None]:
# A rating count that reads exactly 'Too Few Ratings' is swapped for the
# string '0'; every other value is left untouched.
df['Number of Ratings'] = df['Number of Ratings'].replace({'Too Few Ratings': '0'})


In [None]:
# Spot-check the placeholder replacements on the first rows only.
df.head()

In [None]:
# Remove the first '+' and everything that follows it on the same line,
# so a value shaped like '100+ ratings' is cut down to '100'.
raw_rating_counts = df['Number of Ratings']
df['Number of Ratings'] = raw_rating_counts.str.replace(pat=r'\+.*', repl='', regex=True)

In [None]:
# Spot-check the trimmed 'Number of Ratings' values on the first rows only.
df.head()

In [None]:
# Keep only the digits of each rating count (the column stays text).
# A thousands abbreviation directly after a digit ('1K', '10k') is expanded
# to '000' first; stripping non-digits alone would silently shrink '1K' to '1'.
# NOTE(review): assumes a 'K' suffix means thousands and never follows a
# decimal part (e.g. '1.5K') — confirm against the raw data.
rating_counts_text = df['Number of Ratings'].str.replace(r'(?<=\d)\s*[kK]\b', '000', regex=True)
df['Number of Ratings'] = rating_counts_text.str.replace(r'\D', '', regex=True)

In [None]:
# Spot-check the digits-only 'Number of Ratings' values on the first rows only.
df.head()

In [None]:
# Turn every newline inside the offer text into a comma (literal match, no regex).
offer_text = df['Offer Name']
df['Offer Name'] = offer_text.str.replace('\n', ',', regex=False)

In [None]:
# Spot-check the flattened 'Offer Name' values on the first rows only.
df.head()

<div style="padding: 20px;border-radius: 16px;background-color: #FF3CAC;
background-image: linear-gradient(225deg, #FF3CAC 0%, #784BA0 50%, #2B86C5 100%);
color: white;text-align:center;font-family: Inter,sans-serif;">
        <h4 style="font-size: 24px;">Exploratory Data Analysis (EDA)</h4>
    </div>

In [None]:
# Tally the rows per 'Pure Veg' value as a two-column frame
# ('Pure Veg', 'Count') that plotly can consume directly.
pure_veg_counts = (
    df['Pure Veg']
    .value_counts()
    .rename_axis('Pure Veg')
    .reset_index(name='Count')
)

# One pie slice per 'Pure Veg' value, sized by its count.
fig = px.pie(
    data_frame=pure_veg_counts,
    values='Count',
    names='Pure Veg',
    template='seaborn',
    color_discrete_sequence=px.colors.qualitative.Set3,
    title='Distribution of Pure Vegetarian Restaurants',
)

fig.show()

In [None]:
import matplotlib.pyplot as plt

# Parse ratings as numbers; anything that cannot be parsed becomes NaN.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Bucket each rating: missing or below 3 -> '0-3', below 4 -> '3-4', otherwise '4+'.
ratings = df['Rating']
df['Rating Category'] = np.select(
    [ratings.isna() | (ratings < 3), ratings < 4],
    ['0-3', '3-4'],
    default='4+',
)
rating_counts = df['Rating Category'].value_counts()
colors = ['gold', 'lightcoral', 'lightskyblue']

# Pie chart with one slice per rating bucket, labelled with its share.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(rating_counts, labels=rating_counts.index, autopct='%1.1f%%', startangle=90, colors=colors)
ax.set_title('Distribution of Ratings')
plt.show()


In [None]:
# Bucket every restaurant's average price and chart the share of each bucket.
# Counting the rows of df (rather than a fixed list of distinct prices) makes
# each slice proportional to the number of restaurants, and the half-open
# bins [low, high) are contiguous so no price can fall between two ranges.
# Prices of 0 (the fill value used for entries without digits during
# cleaning) land in 'Less than 50'.
price_bin_edges = [0, 50, 100, 200, 300, 400, 500, 700, float('inf')]
price_bin_labels = [
    'Less than 50',
    '50-100',
    '100-200',
    '200-300',
    '300-400',
    '400-500',
    '500-700',
    '700 or more',
]

price_band = pd.cut(
    df['Average Price'],
    bins=price_bin_edges,
    labels=price_bin_labels,
    right=False,  # include the lower edge, exclude the upper one
)

# Count restaurants per band in bin order; empty bands are dropped so the
# pie does not draw zero-width slices with overlapping labels.
band_counts = price_band.value_counts(sort=False)
price_counts = band_counts[band_counts > 0].to_dict()

labels = list(price_counts.keys())
sizes = list(price_counts.values())

fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax.axis('equal')  # keep the pie circular

plt.title('Distribution of Average Prices')
plt.show()


In [None]:
# Collect every distinct cuisine name that occurs in the dataset.
cuisine_column = df['Cuisine']

# Each non-missing entry is a ', '-separated list; split it into its parts
# and merge all parts into a single set.
all_cuisines = cuisine_column.dropna().str.split(', ').tolist()
unique_cuisines = set().union(*all_cuisines)

print("All Possible Cuisines:", unique_cuisines)


In [None]:
import pandas as pd
import plotly.express as px

# Count how often each individual cuisine appears; an entry listing several
# cuisines contributes one count to each of them.
cuisine_counts = df['Cuisine'].str.split(', ').explode().value_counts()
top_15_cuisines = cuisine_counts.iloc[:15]
plot_data = top_15_cuisines.rename_axis('Cuisine').reset_index(name='Frequency')
colors = px.colors.qualitative.Set3

# Horizontal bar chart of the 15 most frequent cuisines, one colour per bar.
fig = px.bar(
    plot_data,
    x='Frequency',
    y='Cuisine',
    orientation='h',
    color='Cuisine',
    color_discrete_sequence=colors,
    labels={'Frequency': 'Frequency'},
    title='Top 15 Cuisines',
    width=800,
    height=500,
)

# The y-axis already names each cuisine, so the legend is hidden.
fig.update_layout(showlegend=False)
fig.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build one long text from all non-missing cuisine entries for the word cloud.
cuisine_text = ','.join(df['Cuisine'].dropna())

# random_state pins the otherwise random word placement so that
# Restart & Run All reproduces the same picture (the value 42 is arbitrary).
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42,
).generate(cuisine_text)

# Show the rendered cloud as an image without axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

average_price = df['Average Price']
# Make sure ratings are numeric; anything unparsable becomes NaN.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')


def categorize_rating(rating):
    """Return the rating bucket label for a single numeric rating.

    Args:
        rating: A number, or NaN/None for a missing rating.

    Returns:
        '<3' for missing ratings and ratings below 3, '3-4' for ratings in
        [3, 4), and '4+' otherwise.
    """
    # Every comparison with NaN is False, so without this explicit check a
    # missing rating would fall through to the '4+' branch. Missing values
    # go to the lowest bucket, as in the rating pie chart earlier in the
    # notebook.
    if pd.isna(rating) or rating < 3:
        return '<3'
    if rating < 4:
        return '3-4'
    return '4+'


# Note: this overwrites the 'Rating Category' column with the labels used here.
df['Rating Category'] = df['Rating'].apply(categorize_rating)

# Fixed colour per bucket so the legend reads low -> high as red -> green.
palette = {'<3': 'red', '3-4': 'orange', '4+': 'green'}

plt.figure(figsize=(10, 6))
sns.scatterplot(x=average_price, y=df['Rating'], hue=df['Rating Category'], palette=palette, alpha=0.7)
plt.title('Relation between Average Price and Rating')
plt.xlabel('Average Price')
plt.ylabel('Rating')
plt.legend(title='Rating Category')
plt.show()


In [None]:
import pandas as pd
import plotly.express as px
from geopy.geocoders import ArcGIS


# Rows whose Location is '-' are excluded, then restaurants are counted per location.
df = df[df['Location'] != '-']
restaurant_count = df.groupby("Location")["Restaurant Name"].count().reset_index(name='count')

nom = ArcGIS()


def _lat_long_or_nan(place):
    """Geocode ``place`` and return ``(latitude, longitude)``.

    Returns ``(nan, nan)`` when the geocoder finds no match, instead of
    failing on ``None[1]``.
    """
    match = nom.geocode(place)
    if match is None:
        return (np.nan, np.nan)
    # Index 1 of a geocoding result is the (latitude, longitude) pair.
    return match[1]


# One geocoder request per distinct location (network-bound, can be slow).
coordinates = [_lat_long_or_nan(place) for place in restaurant_count['Location']]
restaurant_count['Lat'] = [pair[0] for pair in coordinates]
restaurant_count['Long'] = [pair[1] for pair in coordinates]

# Locations that could not be geocoded have no position and are left off the map.
fig = px.scatter_mapbox(
    restaurant_count.dropna(subset=['Lat', 'Long']),
    lon='Long',
    lat='Lat',
    zoom=3,
    color='Location',
    size='count',
    title='Restaurant Distribution Across Locations',
    mapbox_style='open-street-map',
)

fig.update_layout(margin={"r": 0, 't': 50, 'l': 0, 'b': 10})
fig.show()


In [None]:
import pandas as pd
import plotly.express as px
from geopy.geocoders import ArcGIS

# Rows whose Location is '-' are excluded before counting.
df = df[df['Location'] != '-']

# Number of restaurants per location.
restaurant_count = df.groupby("Location")["Restaurant Name"].count().reset_index(name='count')

# Geocode each location with ArcGIS (one network request per location).
nom = ArcGIS()


def _lat_long_or_nan(place):
    """Geocode ``place`` and return ``(latitude, longitude)``.

    Returns ``(nan, nan)`` when the geocoder finds no match, instead of
    failing on ``None[1]``.
    """
    match = nom.geocode(place)
    if match is None:
        return (np.nan, np.nan)
    # Index 1 of a geocoding result is the (latitude, longitude) pair.
    return match[1]


coordinates = [_lat_long_or_nan(place) for place in restaurant_count['Location']]
restaurant_count['Lat'] = [pair[0] for pair in coordinates]
restaurant_count['Long'] = [pair[1] for pair in coordinates]

# Density heatmap weighted by restaurant count; locations without
# coordinates are dropped first.
fig = px.density_mapbox(
    restaurant_count.dropna(subset=['Lat', 'Long']),
    lat='Lat',
    lon='Long',
    z='count',
    radius=10,  # smoothing radius of each point
    zoom=4,
    mapbox_style="carto-positron",
    title='Restaurant Heatmap in India',
)

fig.show()
