# Level 2

## Task 1 - Restaurant Ratings

### Analyze the distribution of aggregate ratings and determine the most common rating range.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def analyze_rating_distribution(csv_file_path, rating_column_name):
    """Plot a histogram of a rating column and return its most frequent value.

    Args:
        csv_file_path: Path of the CSV file to load.
        rating_column_name: Name of the numeric rating column.

    Returns:
        The first mode of the rating column, or ``None`` when the column
        holds no non-missing values.
    """
    import math  # local import keeps this cell self-contained

    df = pd.read_csv(csv_file_path)
    ratings = df[rating_column_name].dropna()

    # Keep the 1..5 axis as the baseline, but widen it to whole-number edges
    # that enclose the data: values outside the bin edges are silently
    # dropped by plt.hist, which would hide e.g. ratings of 0.
    lowest, highest = 1, 5
    if not ratings.empty:
        lowest = min(lowest, math.floor(ratings.min()))
        highest = max(highest, math.ceil(ratings.max()))
    bin_edges = list(range(lowest, highest + 1))

    # Plot a histogram for the distribution of aggregate ratings
    plt.figure(figsize=(8, 6))
    plt.hist(ratings, bins=bin_edges, align='left', color='skyblue', edgecolor='black', rwidth=0.8)

    plt.title('Distribution of Aggregate Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Number of Restaurants')
    plt.xticks(bin_edges)  # One tick per bin edge, matching the left-aligned bars
    plt.show()

    # mode() returns an empty Series when there is nothing to count
    modes = ratings.mode()
    if modes.empty:
        return None

    return modes.values[0]

# Column holding each restaurant's aggregate rating, and the input file
rating_column_name = 'Aggregate rating'
csv_file_path = 'Dataset .csv'

# Draw the histogram and capture the most frequent rating value
most_common_rating = analyze_rating_distribution(
    csv_file_path=csv_file_path,
    rating_column_name=rating_column_name,
)

# Report the outcome
print(f"The most common rating range is: {most_common_rating}")


### Calculate the average number of votes received by restaurants.

In [None]:
import pandas as pd

def calculate_average_votes(csv_file_path, votes_column_name):
    """Return the mean of one column of a CSV file.

    Args:
        csv_file_path: Path of the CSV file to load.
        votes_column_name: Name of the column to average.

    Returns:
        The result of pandas' ``Series.mean()`` on that column.
    """
    frame = pd.read_csv(csv_file_path)
    votes = frame[votes_column_name]
    return votes.mean()

# Column holding the vote counts, and the input file
votes_column_name = 'Votes'
csv_file_path = 'Dataset .csv'

# Compute the mean vote count across all restaurants
result = calculate_average_votes(
    csv_file_path=csv_file_path,
    votes_column_name=votes_column_name,
)

# Report the outcome, rounded to two decimals
print(f"The average number of votes received by restaurants is: {result:.2f}")


## Task 2 - Cuisine Combination

### Identify the most common combinations of cuisines in the dataset.

In [None]:
import pandas as pd
from collections import Counter
from itertools import combinations

def most_common_cuisine_combinations(csv_file_path, cuisine_column_name, top_n=5):
    """Count how often each pair of cuisines is offered by the same restaurant.

    Each cell of the cuisine column is treated as a comma-separated list.
    Pairs are unordered: "Chinese, Thai" and "Thai, Chinese" contribute to
    the same ``('Chinese', 'Thai')`` key.

    Args:
        csv_file_path: Path of the CSV file to load.
        cuisine_column_name: Name of the column with comma-separated cuisines.
        top_n: How many of the most frequent pairs to return (default 5).

    Returns:
        A list of ``((cuisine_a, cuisine_b), count)`` tuples, most frequent
        first, with the two names of each pair in alphabetical order.
    """
    df = pd.read_csv(csv_file_path)

    pair_counts = Counter()
    # Rows without any cuisine information cannot form a pair, so skip them
    for entry in df[cuisine_column_name].dropna():
        # Strip stray whitespace, drop empty/duplicate names, and sort so the
        # order in which a restaurant lists its cuisines does not matter.
        cuisines = sorted({name.strip() for name in str(entry).split(',') if name.strip()})
        pair_counts.update(combinations(cuisines, 2))

    return pair_counts.most_common(top_n)

# Column with the comma-separated cuisine lists, and the input file
cuisine_column_name = 'Cuisines'
csv_file_path = 'Dataset .csv'

# Collect the most frequent cuisine pairs
result = most_common_cuisine_combinations(
    csv_file_path=csv_file_path,
    cuisine_column_name=cuisine_column_name,
)

# Report one line per pair
print("Most Common Cuisine Combinations:")
for combination, count in result:
    print(f"{', '.join(combination)}: {count} occurrences")


### Determine if certain cuisine combinations tend to have higher ratings.

In [None]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

def average_ratings_by_cuisine_combination(csv_file_path, cuisine_column_name, rating_column_name):
    """Average the rating of every pair of cuisines served by the same restaurant.

    Each cell of the cuisine column is treated as a comma-separated list.
    Pairs are unordered: "Chinese, Thai" and "Thai, Chinese" feed the same
    ``('Chinese', 'Thai')`` key. Rows missing either the cuisines or the
    rating are ignored.

    Args:
        csv_file_path: Path of the CSV file to load.
        cuisine_column_name: Name of the column with comma-separated cuisines.
        rating_column_name: Name of the numeric rating column.

    Returns:
        A list of ``((cuisine_a, cuisine_b), average_rating)`` tuples sorted
        by average rating, highest first.
    """
    df = pd.read_csv(csv_file_path)

    # A missing rating would turn the whole pair average into NaN and make
    # the final sort order unreliable, so drop incomplete rows up front.
    rated = df.dropna(subset=[cuisine_column_name, rating_column_name])

    ratings_by_combination = defaultdict(list)
    for entry, rating in zip(rated[cuisine_column_name], rated[rating_column_name]):
        # Strip, de-duplicate and sort so listing order does not split a pair
        cuisines = sorted({name.strip() for name in str(entry).split(',') if name.strip()})
        for combo in combinations(cuisines, 2):
            ratings_by_combination[combo].append(rating)

    average_ratings_by_combination = {
        combo: sum(ratings) / len(ratings)
        for combo, ratings in ratings_by_combination.items()
    }

    return sorted(
        average_ratings_by_combination.items(),
        key=lambda item: item[1],
        reverse=True,
    )

# Input file plus the cuisine and rating columns to combine
rating_column_name = 'Aggregate rating'
cuisine_column_name = 'Cuisines'
csv_file_path = 'Dataset .csv'

# Average rating per cuisine pair, best-rated first
result = average_ratings_by_cuisine_combination(
    csv_file_path=csv_file_path,
    cuisine_column_name=cuisine_column_name,
    rating_column_name=rating_column_name,
)

# Report one line per pair
print("Average Ratings by Cuisine Combination (Descending Order):")
for combo, average_rating in result:
    print(f"{', '.join(combo)}: {average_rating:.2f}")


## Task 3 - Geographic Analysis 

### Plot the locations of restaurants on a map using longitude and latitude coordinates. 

In [None]:
import pandas as pd
import folium
from IPython.display import display

def plot_restaurant_locations(csv_file_path, latitude_column_name, longitude_column_name, name_column):
    """Display a folium map with one marker per restaurant.

    Rows lacking a latitude or longitude are skipped, because folium rejects
    NaN coordinates. The map is centred on the first row that has both.
    When no row is usable, an empty default map is displayed instead.

    Args:
        csv_file_path: Path of the CSV file to load.
        latitude_column_name: Name of the latitude column.
        longitude_column_name: Name of the longitude column.
        name_column: Name of the column used as each marker's popup text.

    Returns:
        None. The map is rendered through ``IPython.display.display``.
    """
    df = pd.read_csv(csv_file_path)

    # Only rows with both coordinates can be placed on the map
    located = df.dropna(subset=[latitude_column_name, longitude_column_name])

    if located.empty:
        # Nothing to centre on or mark: show folium's default map
        display(folium.Map())
        return

    # Centre the map on the first plottable restaurant
    map_center = [located[latitude_column_name].iloc[0], located[longitude_column_name].iloc[0]]
    restaurant_map = folium.Map(location=map_center, zoom_start=12)

    # zip over the three columns avoids building a Series per row (iterrows)
    for latitude, longitude, name in zip(
        located[latitude_column_name],
        located[longitude_column_name],
        located[name_column],
    ):
        folium.Marker([latitude, longitude], popup=name).add_to(restaurant_map)

    # Display the map directly in the Jupyter Notebook
    display(restaurant_map)

# Input file plus the coordinate and label columns used for the markers
name_column = 'Restaurant Name'
longitude_column_name = 'Longitude'
latitude_column_name = 'Latitude'
csv_file_path = 'Dataset .csv'

# Render every restaurant on an interactive map
plot_restaurant_locations(
    csv_file_path=csv_file_path,
    latitude_column_name=latitude_column_name,
    longitude_column_name=longitude_column_name,
    name_column=name_column,
)


### Identify any patterns or clusters of restaurants in specific areas.

In [None]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
from IPython.display import display

def plot_clustered_restaurants(csv_file_path, latitude_column_name, longitude_column_name, n_clusters,
                               name_column='Restaurant Name'):
    """Cluster restaurants by coordinates with K-means and map them by cluster colour.

    Rows lacking a latitude or longitude are dropped before clustering,
    since neither KMeans nor folium accepts NaN coordinates.

    Args:
        csv_file_path: Path of the CSV file to load.
        latitude_column_name: Name of the latitude column.
        longitude_column_name: Name of the longitude column.
        n_clusters: Number of K-means clusters to form.
        name_column: Column used as each marker's popup text
            (defaults to 'Restaurant Name').

    Returns:
        None. The map is rendered through ``IPython.display.display``.
    """
    # Colour names accepted by folium.Icon; clusters beyond the palette
    # length reuse colours from the start.
    palette = (
        'red', 'blue', 'green', 'purple', 'orange', 'darkred', 'darkblue',
        'darkgreen', 'cadetblue', 'pink', 'gray', 'black', 'lightblue',
        'lightgreen', 'beige', 'lightred', 'darkpurple', 'lightgray',
    )

    df = pd.read_csv(csv_file_path)

    # copy() so that adding the 'Cluster' column does not write into a view
    df = df.dropna(subset=[latitude_column_name, longitude_column_name]).copy()

    # Extract latitude and longitude columns
    locations = df[[latitude_column_name, longitude_column_name]]

    # Apply K-means clustering (fixed seed for reproducible labels)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(locations)
    df['Cluster'] = kmeans.labels_

    # Create a folium map centered at the mean coordinates
    map_center = [df[latitude_column_name].mean(), df[longitude_column_name].mean()]
    restaurant_map = folium.Map(location=map_center, zoom_start=12)

    # One marker per restaurant, coloured by its cluster label
    for latitude, longitude, name, cluster in zip(
        df[latitude_column_name],
        df[longitude_column_name],
        df[name_column],
        df['Cluster'],
    ):
        color = palette[int(cluster) % len(palette)]
        folium.Marker([latitude, longitude],
                      popup=name,
                      icon=folium.Icon(color=color)).add_to(restaurant_map)

    # Display the map
    display(restaurant_map)

# Input file plus the coordinate columns fed to the clustering
csv_file_path = 'Dataset .csv'
longitude_column_name = 'Longitude'
latitude_column_name = 'Latitude'
name_column = 'Restaurant Name'  # Column with restaurant names (not passed to the call below)
n_clusters = 5  # Number of K-means clusters to form

# Cluster the restaurants and show them on a map in the notebook
plot_clustered_restaurants(
    csv_file_path=csv_file_path,
    latitude_column_name=latitude_column_name,
    longitude_column_name=longitude_column_name,
    n_clusters=n_clusters,
)


## Task 4 - Restaurant Chains

### Identify if there are any restaurant chains present in the dataset.

In [None]:
import pandas as pd

def identify_restaurant_chains(csv_file_path, name_column):
    """List the restaurant names that occur on more than one row of a CSV file.

    Args:
        csv_file_path: Path of the CSV file to load.
        name_column: Name of the column holding restaurant names.

    Returns:
        A list of names appearing at least twice, in the order produced by
        ``Series.value_counts()`` (most frequent first).
    """
    names = pd.read_csv(csv_file_path)[name_column]

    # Tally how many rows carry each name
    occurrences = names.value_counts()

    # A name seen on several rows is treated as a chain
    repeated = occurrences.loc[occurrences.gt(1)]
    return repeated.index.tolist()

# Column holding restaurant names, and the input file
name_column = 'Restaurant Name'
csv_file_path = 'Dataset .csv'

# Collect every name that appears on more than one row
chains = identify_restaurant_chains(
    csv_file_path=csv_file_path,
    name_column=name_column,
)

# Report the outcome
print("Identified Restaurant Chains:")
print(chains)


### Analyze the ratings and popularity of different restaurant chains.

In [None]:
import pandas as pd

def analyze_chain_ratings_popularity(csv_file_path, name_column, rating_column, votes_column,
                                     min_locations=2):
    """Summarise average rating and total votes for each restaurant chain.

    A chain is a restaurant name that appears on at least ``min_locations``
    rows of the file (default 2, i.e. more than one location). Names below
    that threshold are left out of the result.

    Args:
        csv_file_path: Path of the CSV file to load.
        name_column: Name of the column holding restaurant names.
        rating_column: Name of the numeric rating column (averaged).
        votes_column: Name of the numeric votes column (summed).
        min_locations: Minimum number of rows a name needs to count as a
            chain. Pass 1 to aggregate every restaurant name.

    Returns:
        A DataFrame indexed by restaurant name with the mean of
        ``rating_column`` and the sum of ``votes_column`` per chain.
    """
    df = pd.read_csv(csv_file_path)

    # Decide chain membership on the full data, before incomplete rows are
    # removed, so a location with a missing rating still counts as a location.
    location_counts = df[name_column].value_counts()
    chain_names = location_counts[location_counts >= min_locations].index
    chain_rows = df[df[name_column].isin(chain_names)]

    # Filter out rows with missing ratings or votes
    chain_rows = chain_rows.dropna(subset=[rating_column, votes_column])

    # Group by restaurant name and calculate average rating and total votes
    chain_stats = chain_rows.groupby(name_column).agg({
        rating_column: 'mean',
        votes_column: 'sum'
    })

    return chain_stats

# Input file plus the name, rating and vote columns to aggregate
votes_column = 'Votes'
rating_column = 'Aggregate rating'
name_column = 'Restaurant Name'
csv_file_path = 'Dataset .csv'

# Average rating and total votes per restaurant name
chain_stats = analyze_chain_ratings_popularity(
    csv_file_path=csv_file_path,
    name_column=name_column,
    rating_column=rating_column,
    votes_column=votes_column,
)

# Report the outcome
print("Chain Ratings and Popularity:")
print(chain_stats)

