In [None]:
# standard library

import os

# core libraries

import numpy as np
import pandas as pd

# visualization libraries

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the Swiggy restaurant dataset.
# The CSV location can be overridden through the SWIGGY_CSV environment variable
# so the notebook is not tied to a single machine; the local path below is only
# the fallback used when that variable is not set.

DATA_PATH = os.environ.get(
    "SWIGGY_CSV",
    r"C:\Users\Vrishikaa\Downloads\archive (2)\swiggy.csv",
)

df = pd.read_csv(DATA_PATH)
df.head(2)


In [None]:
# Count fully duplicated rows, report the count, then drop them.
# The count is printed explicitly because a notebook only auto-displays the
# last expression of a cell, and the last statement here is the assignment.

n_duplicates = int(df.duplicated().sum())
print(f"Duplicate rows found: {n_duplicates}")

# Reassign instead of mutating in place so the cell reads as a plain transform.
df = df.drop_duplicates()


In [None]:
# Keep only the first occurrence of every fully duplicated row.
# Rows are compared across all columns, not on a single column.

df = df.drop_duplicates(keep='first')


In [None]:
# Verify that no duplicated rows remain, then preview the first two rows.
# head() alone cannot show whether duplicates are gone, so assert it explicitly.

assert not df.duplicated().any(), "duplicate rows remain in df"
df.head(2)


In [None]:
# Number of rows and columns in df, returned as a (rows, columns) tuple.

df.shape


In [None]:
(8680, 10)

In [None]:
# Structural summary of the dataset: column names, non-null counts, dtypes
# and memory usage.

df.info()


In [None]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8680 entries, 0 to 8679
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             8680 non-null   int64  
 1   Area           8680 non-null   object 
 2   City           8680 non-null   object 
 3   Restaurant     8680 non-null   object 
 4   Price          8680 non-null   float64
 5   Avg ratings    8680 non-null   float64
 6   Total ratings  8680 non-null   int64  
 7   Food type      8680 non-null   object 
 8   Address        8680 non-null   object 
 9   Delivery time  8680 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 678.3+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) as produced by
# DataFrame.describe() with its default settings.

df.describe()


In [None]:
# Number of missing (NaN) entries in each column.

df.isna().sum()


In [None]:
ID               0
Area             0
City             0
Restaurant       0
Price            0
Avg ratings      0
Total ratings    0
Food type        0
Address          0
Delivery time    0
dtype: int64

In [None]:
# Drop the 'Address' column.
# errors='ignore' together with reassignment keeps the cell safe to re-run:
# a second execution is a no-op instead of raising KeyError because the
# column has already been removed.

df = df.drop(columns='Address', errors='ignore')


In [None]:
# Preview the first row of df.
df.head(1)


In [None]:
# Use 'City' as the index of df.
# The guard keeps the cell re-runnable: once 'City' has moved into the index
# it is no longer a column, so an unconditional set_index would raise KeyError.

if df.index.name != 'City':
    df = df.set_index('City')

df


In [None]:
# Print the current column labels of df as a plain Python list.
print(list(df.columns))


In [None]:
['ID', 'Area', 'Restaurant', 'Price', 'Avg ratings', 'Total ratings', 'Food type', 'Delivery time']


In [None]:
# Display the current state of df.
df


In [None]:
# Pairwise correlations between the numeric columns of df
# (numeric_only=True leaves the text columns out of the computation).
corr_matrix = df.corr(numeric_only=True)

# Draw the matrix as a heatmap:
#   annot=True      -> write each coefficient inside its cell
#   cmap='coolwarm' -> blue for negative values, red for positive values
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Summarise every (City, Area) pair: the numeric columns are averaged, while
# the text columns keep a sample of up to three distinct values.

def _sample_three(values):
    """Join the first three distinct values of a group into one string."""
    return ', '.join(str(v) for v in values.unique()[:3])


area_summary_spec = {
    'Price': 'mean',
    'Avg ratings': 'mean',
    'Total ratings': 'mean',
    'Delivery time': 'mean',
    'Restaurant': _sample_three,
    'Food type': _sample_three,
}

City_df = df.groupby(['City', 'Area']).agg(area_summary_spec)

# Turn the group keys back into ordinary columns and round to 2 decimals.
City_df = City_df.reset_index().round(2)

print(City_df.head())


In [None]:
        City                  Area   Price  Avg ratings  Total ratings  \
0  Ahmedabad   Akhbar Nagar Circle  200.00         2.90          80.00   
1  Ahmedabad                 Acher  200.00         3.70         100.00   
2  Ahmedabad             Ahmedabad  344.44         4.07         124.44   
3  Ahmedabad              Ambavadi  200.00         4.40          20.00   
4  Ahmedabad              Ambawadi  302.00         3.72          61.60   

   Delivery time                                         Restaurant  \
0          53.00                              Shiv Shakti Fast Food   
1          70.00                                Punjabi Food On Way   
2          43.22  Prithvi Hotel, Fish Express, Grill N Rice Rest...   
3          39.00                                        Mk Sandwich   
4          38.56  Harshu'S Late Night Munchies, Umami By Curries...   

                                           Food type  
0                                 Gujarati,Fast Food  
1                 North Indian,Chinese,Punjabi,Combo  
2  Indian,Chinese,Continental, Indian,Tandoor,Sea...  
3                         Chinese,American,Beverages  
4  Fast Food,Italian,Chinese,Snacks, Indian,Chine...  


In [None]:
# Renumber the rows of City_df from 0.
# drop=True discards the old index instead of inserting it as a new column.
# The group keys were already turned into columns when City_df was built, so a
# plain reset_index() would only add a redundant 'index' column (and yet
# another one on every re-run of this cell).

City_df = City_df.reset_index(drop=True)


In [None]:
# Display the City/Area summary table.
City_df 


In [None]:
# Size of the City/Area summary produced by the groupby, as (rows, columns).
# Only the last expression of a cell is displayed, so the shape is the single
# statement here.

City_df.shape


In [None]:
(843, 9)

In [None]:
# Preview the areas ranked within each city:
# - 'City' ascending keeps all areas of one city together;
# - 'Avg ratings' descending puts the best-rated area of each city first.
# The sorted frame is only displayed here; it is not stored.

City_df.sort_values(by=['City', 'Avg ratings'], ascending=[True, False])


In [None]:
# Step 1: rank the areas inside every city.
# Rows are ordered by city name first, then by average rating from highest to
# lowest, so each city's best-rated area ends up as the first row of its group.

rank_keys = ['City', 'Avg ratings']
rank_ascending = [True, False]   # City A-Z, ratings high-to-low

City_sorted = City_df.sort_values(by=rank_keys, ascending=rank_ascending)

# Show the first ten rows of the ranked table
City_sorted.head(10)


In [None]:
# Step 2: extract the top performing area of each city.
# City_sorted lists each city's areas from highest to lowest average rating,
# so the first row of every city group is its best-rated area.
# reset_index(drop=True) renumbers the rows without copying the old row labels
# into an extra column.

top_area_per_city = City_sorted.groupby('City').head(1).reset_index(drop=True)

# Display results
top_area_per_city


In [None]:
# Step 3: bar chart of the best area rating found in each city.

fig, ax = plt.subplots(figsize=(8, 5))

best_ratings = top_area_per_city['Avg ratings']
ax.bar(top_area_per_city['City'], best_ratings,
       color='skyblue', edgecolor='black')

# Title and axis labels
ax.set_title("Top Performing Area per City (Based on Avg Ratings)")
ax.set_xlabel("City")
ax.set_ylabel("Average Rating")
plt.xticks(rotation=45, ha='right')

# Write the exact rating just above each bar
for position, rating in enumerate(best_ratings):
    ax.text(position, rating + 0.02, str(rating), ha='center', fontsize=9)

plt.tight_layout()
plt.show()


In [None]:
# City-level summary: one row per city.
city_summary_spec = {
    'Price': 'mean',           # average price per city
    'Avg ratings': 'mean',     # average customer rating per city
    'Total ratings': 'sum',    # total number of ratings per city
    'Delivery time': 'mean',   # average delivery time per city
}

per_city = df.groupby('City')

# Aggregate, then round to 2 decimal places for readability
city_group = per_city.agg(city_summary_spec).round(2)

# Show the summary table
print(city_group)


In [None]:
            Price  Avg ratings  Total ratings  Delivery time
City                                                        
Ahmedabad  318.13         3.60          74470          44.71
Bangalore  382.52         3.76         140500          50.53
Chennai    356.25         3.78         178860          58.97
Delhi      333.30         3.53          81420          50.73
Hyderabad  299.93         3.70         330270          49.93
Kolkata    362.29         3.70         219800          67.81
Mumbai     393.79         3.60         150960          48.32
Pune       353.76         3.55         122990          55.85
Surat      270.17         3.58          60320          48.48


In [None]:
# Number of distinct restaurant names per city.
# reset_index(name=...) turns the grouped Series into a two-column frame and
# labels the count column in the same step.

city_restaurants = (
    df.groupby('City')['Restaurant']
    .nunique()
    .reset_index(name='Unique Restaurants')
)

# Show the table
print(city_restaurants)


In [None]:
        City  Unique Restaurants
0  Ahmedabad                 709
1  Bangalore                 938
2    Chennai                1096
3      Delhi                 611
4  Hyderabad                1030
5    Kolkata                1325
6     Mumbai                1253
7       Pune                1080
8      Surat                 505


In [None]:
# Number of distinct restaurant names per city, as a two-column frame.

city_restaurants = (
    df.groupby('City')['Restaurant']
    .nunique()
    .reset_index(name='Unique Restaurants')
)

# Show the table
print(city_restaurants)

# ---- Visualization ----

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(
    city_restaurants['City'],
    city_restaurants['Unique Restaurants'],
    color='coral',
    edgecolor='black',
)

ax.set_title("Unique Restaurants by City")
ax.set_xlabel("City")
ax.set_ylabel("Number of Unique Restaurants")
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
        City  Unique Restaurants
0  Ahmedabad                 709
1  Bangalore                 938
2    Chennai                1096
3      Delhi                 611
4  Hyderabad                1030
5    Kolkata                1325
6     Mumbai                1253
7       Pune                1080
8      Surat                 505


In [None]:
# Top 3 cuisines per city.
# A 'Food type' entry can list several cuisines separated by commas
# (e.g. "Gujarati,Fast Food"), so every entry is split into its individual
# cuisines before counting. Counting the raw strings would only tally exact
# combinations and miss a cuisine whenever it appears alongside others.

# reset_index() makes 'City' an ordinary column regardless of whether it is
# currently the index of df.
cuisine_rows = df.reset_index()[['City', 'Food type']]
cuisine_rows = cuisine_rows.assign(
    **{'Food type': cuisine_rows['Food type'].str.split(',')}
).explode('Food type')

# Remove stray whitespace around cuisine names and discard empty fragments.
cuisine_rows['Food type'] = cuisine_rows['Food type'].str.strip()
cuisine_rows = cuisine_rows[cuisine_rows['Food type'] != '']

# For each city, count every cuisine and keep the three most common ones.
city_food = cuisine_rows.groupby('City')['Food type'].apply(
    lambda x: x.value_counts().head(3)
)

# Print the result

print(city_food)


In [None]:
City                   
Ahmedabad  Indian          53
           North Indian    30
           Fast Food       26
Bangalore  South Indian    32
           Indian          27
           North Indian    25
Chennai    Indian          56
           South Indian    49
           Fast Food       26
Delhi      North Indian    47
           Indian          27
           Chinese         17
Hyderabad  South Indian    76
           Indian          38
           Chinese         31
Kolkata    Indian          66
           Chinese         52
           Fast Food       28
Mumbai     Chinese         64
           Indian          55
           Fast Food       32
Pune       Chinese         48
           Indian          47
           Fast Food       46
Surat      Fast Food       40
           Indian          20
           North Indian    18
Name: Food type, dtype: int64


In [None]:
# Mean customer rating per city, ordered from the lowest-rated city to the
# highest-rated one, drawn as a bar chart.

mean_rating_by_city = df.groupby('City')['Avg ratings'].mean().sort_values()

mean_rating_by_city.plot.bar(
    title='Mean of Customer Ratings per City',
    figsize=(5, 4),
    color='skyblue',
    edgecolor='black',
)

# Axis labels and styling
plt.xlabel('City')
plt.ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
# Highest-rated area in each city.
# City_df comes out of a groupby on City and Area and is never sorted by
# rating, so taking the first row per city directly would not return the
# best-rated area. The rows are therefore ranked by 'Avg ratings' (descending)
# within each city before the first row of every city is kept.
# reset_index(drop=True) renumbers the result without adding the old row
# labels as an extra column.

top_areas = (
    City_df.sort_values(['City', 'Avg ratings'], ascending=[True, False])
    .groupby('City')
    .head(1)
    .reset_index(drop=True)
)
top_areas


In [None]:
# Histogram of the 'Price' column (30 bins) with a KDE curve overlaid.

prices = df['Price']
sns.histplot(prices, kde=True, bins=30)
plt.title("Price Distribution")
plt.show()


In [None]:
# The ten restaurant names that occur most often in the dataset.
# value_counts() orders names by frequency, so head(10) keeps the ten most
# frequent ones.

top_restaurants = df['Restaurant'].value_counts().head(10)
top_restaurants.plot.bar()

# Title describing what the chart shows
plt.title("Top 10 Restaurants by Count")

# Render the bar chart
plt.show()


In [None]:
# Boxplot of the average ratings in each city: shows the median, the spread
# and the outliers per city.
# 'City' is moved into the index of df earlier in the notebook, and some
# seaborn releases raise "Could not interpret input 'City'" when x/y name an
# index level instead of a column (NOTE(review): believed to affect seaborn
# < 0.13 - confirm against the installed version). reset_index() turns the
# index back into a regular column so the plot works either way.

sns.boxplot(x='City', y='Avg ratings', data=df.reset_index())

# Add a title to describe the chart

plt.title("Ratings Distribution by City")

# Rotate x-axis labels for better readability

plt.xticks(rotation=45)

# Display the plot

plt.show()


In [None]:
# Average delivery time per city, ordered from fastest to slowest.

city_delivery = df.groupby('City')['Delivery time'].mean().sort_values()

# Line chart with a marker on every city
city_delivery.plot.line(
    marker='o',
    color='teal',
    linewidth=2,
    figsize=(6, 4),
    title='Average Delivery Time by City',
)

# Axis labels and styling
plt.xlabel('City')
plt.ylabel('Average Delivery Time (minutes)')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
