#### EDA Portfolio Project 4 by USMAN GHULAM NABI (W-53)

### Designing a Marketing Campaign for a Restaurant Chain Using Exploratory Data Analysis

#### Objective:
To utilize exploratory data analysis (EDA) skills to understand customer preferences, dining trends, and the competitive landscape in various 
regions of India, and to design an effective marketing campaign for a restaurant chain.

In [None]:
# import libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns


#### Tasks:

#### 01. Data Cleaning and Preparation:

● Identify and handle missing values.

● Detect and correct any inconsistencies in the dataset (e.g., data types, mislabeled categories).

● Feature engineering (if necessary), like extracting useful information from existing data.

In [None]:
# Load the restaurant dataset from the CSV file in the working directory
DATA_PATH = 'zomato_restaurants_in_India.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded DataFrame
df.head()

In [None]:
# Print the column names, dtypes and non-null counts
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Remove leading/trailing whitespace from the column names
df.columns = df.columns.str.strip()

In [None]:
# Display the column names
df.columns

In [None]:
# missingno bar chart: how complete each column is
msno.bar(df)

In [None]:
# missingno heatmap: how strongly missingness in one column goes with missingness in another
msno.heatmap(df)

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Keep an independent copy of the data as it is at this point.
# NOTE(review): df1 is not referenced anywhere else in the visible part of the notebook.
df1=df.copy()

In [None]:
# Remove duplicated rows in place, keeping the first occurrence of each
df.drop_duplicates(keep='first',inplace=True)

#### 02. Exploratory Data Analysis:

● Descriptive Statistics: Summarize the central tendency, dispersion, and shape of the dataset's distribution.

● Distribution Analysis: Analyze the distribution of key variables (e.g., ratings, price range, cuisines).

● Correlation Analysis: Examine the relationships between different variables.

#### i. Descriptive Statistics:

In [None]:
# (rows, columns) of the DataFrame
df.shape

In [None]:
# Summary statistics for numerical columns
# (describe() with no arguments; the result is kept in summary_stats)
summary_stats = df.describe()
# Bare expression so the notebook renders the table
summary_stats

In [None]:
# Summary of the object-dtype columns
df.describe(include=object)

In [None]:
# Skewness of every int64/float64 column
numeric_for_skew = df.select_dtypes(include=['int64', 'float64'])
skewness_numeric = numeric_for_skew.skew()
skewness_numeric



In [None]:
# Kurtosis of every int64/float64 column
numeric_for_kurt = df.select_dtypes(include=['int64', 'float64'])
kurtosis_numeric = numeric_for_kurt.kurt()
kurtosis_numeric


#### ii. Distribution Analysis: 

In [None]:
# Rating Distribution: one histogram and one box plot of the same column
ratings = df['aggregate_rating']

# Histogram of aggregate ratings (20 bins)
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(ratings, bins=20, color='skyblue', edgecolor='black')
hist_ax.set_xlabel('Aggregate Rating')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Aggregate Ratings')
plt.show()

# Box plot of aggregate ratings
box_fig, box_ax = plt.subplots()
box_ax.boxplot(ratings)
box_ax.set_ylabel('Aggregate Rating')
box_ax.set_title('Box Plot of Aggregate Ratings')
plt.show()


In [None]:
# Price Range Distribution

# Number of restaurants in each price range. value_counts() orders by frequency;
# sort_index() puts the bars back in ascending price-range order so the axis
# reads as an ordered scale.
price_range_counts = df['price_range'].value_counts().sort_index()

# Bar plot of price range distribution
price_range_counts.plot(kind='bar', color='lightgreen')
plt.xlabel('Price Range')
plt.ylabel('Count')
plt.title('Distribution of Price Ranges')
plt.show()


In [None]:
# Cuisine Distribution
# NOTE(review): the plural column name suggests a cell may list several cuisines
# separated by commas - confirm against the data. Counting the raw cell strings
# would then rank *combinations* of cuisines, so each cell is split into
# individual cuisines first. If cells hold a single cuisine, the split is a
# no-op and the counts are the same as counting the raw values.
cuisine_counts = (
    df['cuisines']
    .str.split(',')      # one list of cuisines per restaurant (NaN stays NaN)
    .explode()           # one row per (restaurant, cuisine)
    .str.strip()         # drop the space that follows each comma
    .value_counts()      # NaN entries are excluded by default
    .head(10)            # Considering top 10 cuisines
)

# Bar plot of cuisine distribution
cuisine_counts.plot(kind='bar', color='salmon')
plt.xlabel('Cuisine')
plt.ylabel('Count')
plt.title('Top 10 Cuisines')
plt.xticks(rotation=45, ha='right')
plt.show()


#### iii. Correlation Analysis:

In [None]:

# Pairwise correlations between the int64/float64 columns
numeric_columns = df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()

# Draw the matrix as an annotated heatmap on an explicitly created Axes
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()


#### 03. Regional Analysis:

● Compare the restaurant trends and customer preferences across different cities or regions in India.

● Identify unique characteristics of the dining scene in each region.

In [None]:
# Compare the restaurant trends and customer preferences across different cities or regions in India.

# Group data by city
city_groups = df.groupby('city')

# Calculate average cost for two by city
avg_cost_by_city = city_groups['average_cost_for_two'].mean()

# Calculate average ratings by city
avg_ratings_by_city = city_groups['aggregate_rating'].mean()

# Top 5 'cuisines' values in each city. groupby(level=0).nlargest() prepends the
# group key, which repeats the city level; dropping level 0 leaves a
# (city, cuisines) index.
top_cuisines_by_city = (
    city_groups['cuisines']
    .value_counts()
    .groupby(level=0)
    .nlargest(5)
    .reset_index(level=0, drop=True)
)

# Combine top cuisines data from all cities.
# NOTE: a cuisine's total only includes the cities where it made the local
# top 5, so these are not dataset-wide counts.
combined_top_cuisines = top_cuisines_by_city.groupby('cuisines').sum().nlargest(5)

# Plot top cuisines for all cities. The colormap is sampled evenly, one colour
# per bar; passing the full 256-entry colour list would give the bars only its
# first few, nearly identical, entries.
plt.figure(figsize=(10, 6))
bar_colors = plt.cm.viridis(np.linspace(0, 1, len(combined_top_cuisines))).tolist()
combined_top_cuisines.plot(kind='barh', color=bar_colors)
plt.title('Top 5 Cuisines Across Cities')
plt.xlabel('Count')
plt.ylabel('Cuisine')
plt.tight_layout()
plt.show()

# Visualize average cost for two by city
plt.figure()
avg_cost_by_city.plot(kind='bar', color='lightgreen')
plt.title('Average Cost for Two by City')
plt.xlabel('City')
plt.ylabel('Average Cost for Two')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Visualize average ratings by city
plt.figure()
avg_ratings_by_city.plot(kind='bar', color='salmon')
plt.title('Average Ratings by City')
plt.xlabel('City')
plt.ylabel('Average Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Identify unique characteristics of the dining scene in each region.

# One group per city; every statistic below is computed within those groups
city_groups = df.groupby('city')

# Mean cost for two and mean aggregate rating per city
avg_cost_by_city = city_groups['average_cost_for_two'].mean()
avg_ratings_by_city = city_groups['aggregate_rating'].mean()

# Five most frequent cuisines values and establishment values within each city
cuisine_counts_by_city = city_groups['cuisines'].value_counts()
top_cuisines_by_city = cuisine_counts_by_city.groupby(level=0).nlargest(5)
establishment_counts_by_city = city_groups['establishment'].value_counts()
top_establishments_by_city = establishment_counts_by_city.groupby(level=0).nlargest(5)

# Display the results: a heading line followed by the matching table
report_sections = [
    ("Top 5 Cuisines in Each City:", top_cuisines_by_city),
    ("\nAverage Cost for Two by City:", avg_cost_by_city),
    ("\nTop 5 Establishments in Each City:", top_establishments_by_city),
    ("\nAverage Ratings by City:", avg_ratings_by_city),
]
for heading, table in report_sections:
    print(heading)
    print(table)


#### 04. Customer Preference Analysis:
● Analyze the types of cuisines that are popular in different regions.

● Examine the relationship between restaurant ratings, price range, and popularity.

In [None]:
# Analyze the types of cuisines that are popular in different regions.

# Group the data by 'city' or 'region'
region_groups = df.groupby('city')  # Change 'city' to 'region' if the dataset uses 'region' for grouping

# Five most frequent 'cuisines' values in each region. groupby(level=0).nlargest()
# prepends the group key, so the index carries the region twice before the
# cuisine level.
top_cuisines_by_region = region_groups['cuisines'].value_counts().groupby(level=0).nlargest(5)

# Visualize the results: one bar chart per region
for region, top_cuisines in top_cuisines_by_region.groupby(level=0):
    # Keep only the innermost (cuisine) level as the index; otherwise each tick
    # label is rendered as the whole index tuple with the region name repeated.
    counts_by_cuisine = top_cuisines.copy()
    counts_by_cuisine.index = counts_by_cuisine.index.get_level_values(-1)

    plt.figure(figsize=(10, 6))
    counts_by_cuisine.plot(kind='bar', color='skyblue')
    plt.title(f'Top 5 Cuisines in {region}')
    plt.xlabel('Cuisine')
    plt.ylabel('Count')
    # Right-anchor rotated labels so they line up under their bars
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
# Examine the relationship between restaurant ratings, price range, and popularity.

# The three columns of interest; 'votes' is used as the popularity measure
columns = ['aggregate_rating', 'price_range', 'votes']
data = df[columns]

# Pairwise correlations between the three columns
correlation_matrix = data.corr()

# Annotated heatmap of the correlation matrix
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    ax=heat_ax,
)
heat_ax.set_title('Correlation Matrix: Ratings, Price Range, and Popularity')
plt.show()

# Rating against price range, with point colour driven by the vote count
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='aggregate_rating',
    y='price_range',
    hue='votes',
    palette='viridis',
    alpha=0.7,
    ax=scatter_ax,
)
scatter_ax.set_title('Scatter Plot: Ratings vs. Price Range (Colored by Popularity)')
scatter_ax.set_xlabel('Aggregate Rating')
scatter_ax.set_ylabel('Price Range')
plt.show()


#### 05. Competitive Analysis:

● Identify major competitors in each region based on cuisine, pricing, and ratings.

● Analyze the strengths and weaknesses of these competitors.

In [None]:
# Identify major competitors in each region based on cuisine, pricing, and ratings.


def _competitor_profile(restaurants):
    """Summarise one region: three most frequent cuisines values, cost statistics, mean rating."""
    return {
        'Top Cuisines': restaurants['cuisines'].value_counts().nlargest(3).index.tolist(),
        'Pricing Distribution': restaurants['average_cost_for_two'].describe(),
        'Average Rating': restaurants['aggregate_rating'].mean(),
    }


# Group the data by 'region'
region_groups = df.groupby('city')  # Change 'city' to 'region' if the dataset uses 'region' for grouping

# One profile per region, keyed by the group label
competitors = {city: _competitor_profile(rows) for city, rows in region_groups}

# Display the results
for city, profile in competitors.items():
    cuisine_list = ', '.join(profile['Top Cuisines'])
    mean_rating = profile['Average Rating']
    print(f"Region: {city}")
    print(f"Top Cuisines: {cuisine_list}")
    print("Pricing Distribution:")
    print(profile['Pricing Distribution'])
    print(f"Average Rating: {mean_rating:.2f}")
    print("\n")


In [None]:
# Analyze the strengths and weaknesses of these competitors.

# Group the data by 'region'
region_groups = df.groupby('city')  # Change 'city' to 'region' if the dataset uses 'region' for grouping

# Per region: descriptive figures filed under 'Strengths' and 'Weaknesses'
competitor_analysis = {}
for city, rows in region_groups:
    cost = rows['average_cost_for_two']
    rating = rows['aggregate_rating']
    competitor_analysis[city] = {
        'Strengths': {
            'Variety of Cuisines': rows['cuisines'].nunique(),
            # Labelled "price range" but computed as the mean cost for two
            'Average Price Range': cost.mean(),
            'Average Rating': rating.mean(),
        },
        'Weaknesses': {
            'High Price Variability': cost.std(),
            'Lowest Rating': rating.min(),
        },
    }

# Display the results: for each region, the strengths list then the weaknesses list
section_headings = (("Strengths:", 'Strengths'), ("Weaknesses:", 'Weaknesses'))
for city, analysis in competitor_analysis.items():
    print(f"Region: {city}")
    for heading, key in section_headings:
        print(heading)
        for label, value in analysis[key].items():
            print(f"- {label}: {value}")
    print("\n")


#### 06. Market Gap Analysis:
● Identify any gaps in the market that the restaurant chain can capitalize on (e.g., underrepresented cuisines, price ranges).

In [None]:

# A price range counts as "limited" in a region when it holds less than this
# share of the region's restaurants (this includes ranges that are absent).
LIMITED_SHARE_THRESHOLD = 0.10

# Group the data by 'city' or 'region'
region_groups = df.groupby('city')  # Change 'city' to 'region' if the dataset uses 'region' for grouping

# Every price range that occurs anywhere in the data, in ascending order, so a
# range that is completely missing from a region can still be reported.
all_price_ranges = sorted(df['price_range'].dropna().unique())

# Analyze gaps in the market for each region
market_gaps = {}
for region, data in region_groups:
    # Identify underrepresented cuisines: the three least frequent 'cuisines' values.
    # NOTE(review): when many values share the lowest count, which three are
    # returned depends on their order in the counts.
    underrepresented_cuisines = data['cuisines'].value_counts().nsmallest(3).index.tolist()

    # Identify price ranges with limited options: share of the region's
    # restaurants per price range, with absent ranges filled in as 0.
    price_share = (
        data['price_range']
        .value_counts(normalize=True)
        .reindex(all_price_ranges, fill_value=0)
    )
    price_ranges = price_share[price_share < LIMITED_SHARE_THRESHOLD].index.tolist()

    # Store market gaps
    market_gaps[region] = {
        'Underrepresented Cuisines': underrepresented_cuisines,
        'Limited Price Ranges': price_ranges
    }

# Display the results
for region, gaps in market_gaps.items():
    print(f"Region: {region}")
    print("Underrepresented Cuisines:")
    print(', '.join(gaps['Underrepresented Cuisines']))
    print("Limited Price Ranges:")
    # Say so explicitly when no price range falls under the threshold
    print(', '.join(map(str, gaps['Limited Price Ranges'])) or 'None')
    print("\n")
