# Level 3 

## Task 1 - Restaurant Reviews

### Analyze the text reviews to identify the most common positive and negative keywords.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter

# Fetch the NLTK data this cell relies on: the tokenizer models used by
# word_tokenize, the English stop-word list and the VADER sentiment lexicon.
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package while
# older ones use 'punkt', so both are requested to keep word_tokenize working
# on either.
# NOTE(review): nltk.download is expected to report an unknown package and
# return False rather than raise - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('vader_lexicon')

def analyze_reviews_keywords(csv_file_path, review_column):
    """Return the most common keywords of positive and of negative reviews.

    Each review is scored with VADER's compound sentiment. Reviews with a
    score above zero count as positive, below zero as negative; reviews
    scoring exactly zero are ignored. The selected reviews are lower-cased
    and tokenized, and only alphanumeric tokens that are not English stop
    words are counted, so punctuation and filler words cannot crowd out the
    actual keywords.

    Args:
        csv_file_path: Path (or file-like object) passed to ``pd.read_csv``.
        review_column: Name of the column holding the review text.

    Returns:
        A ``(positive_keywords, negative_keywords)`` tuple. Each element is a
        list of up to ten ``(word, count)`` pairs, most frequent first.

    Raises:
        KeyError: If ``review_column`` is not a column of the CSV file.
    """
    df = pd.read_csv(csv_file_path)

    stop_words = set(stopwords.words('english'))
    sid = SentimentIntensityAnalyzer()

    # Missing reviews carry no text to score, so leave them out up front.
    reviews = df[review_column].dropna().astype(str)
    sentiment = reviews.apply(lambda text: sid.polarity_scores(text)['compound'])

    def top_keywords(selected_reviews):
        """Count the cleaned tokens of the given reviews and keep the top ten."""
        tokens = word_tokenize(' '.join(selected_reviews).lower())
        keywords = [
            token for token in tokens
            if token.isalnum() and token not in stop_words
        ]
        return Counter(keywords).most_common(10)

    positive_keywords = top_keywords(reviews[sentiment > 0])
    negative_keywords = top_keywords(reviews[sentiment < 0])

    return positive_keywords, negative_keywords

# Dataset location and the column that holds the review text
csv_file_path = 'Dataset .csv'
review_column = 'Rating text'

# Collect the most common keywords of the positive and the negative reviews
positive_keywords, negative_keywords = analyze_reviews_keywords(
    csv_file_path=csv_file_path,
    review_column=review_column,
)

# Show each keyword list on the line below its heading
print("Most Common Positive Keywords:", positive_keywords, sep="\n")
print("\nMost Common Negative Keywords:", negative_keywords, sep="\n")


### Calculate the average length of reviews and explore if there is a relationship between review length and rating.

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the NLTK tokenizer models, stop-word list and VADER lexicon.
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package while
# older ones use 'punkt', so both are requested to keep nltk.word_tokenize
# working on either.
# NOTE(review): nltk.download is expected to report an unknown package and
# return False rather than raise - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('vader_lexicon')

def analyze_review_length(csv_file_path, review_column, rating_column):
    """Measure review length in tokens, overall and per rating value.

    Args:
        csv_file_path: Path passed to ``pd.read_csv``.
        review_column: Name of the column holding the review text.
        rating_column: Name of the column to group the lengths by.

    Returns:
        A ``(average_review_length, rating_vs_length)`` tuple: the mean token
        count over all rows, and a Series of mean token counts indexed by the
        values of ``rating_column``.
    """
    def count_tokens(text):
        # Every cell is converted to a string before it is tokenized
        return len(nltk.word_tokenize(str(text)))

    reviews_df = pd.read_csv(csv_file_path)
    reviews_df['ReviewLength'] = reviews_df[review_column].apply(count_tokens)

    lengths = reviews_df['ReviewLength']
    overall_mean = lengths.mean()

    # Mean review length for every distinct rating value
    mean_by_rating = lengths.groupby(reviews_df[rating_column]).mean()

    return overall_mean, mean_by_rating

# Dataset location and the columns to analyse; adjust the names to the dataset
csv_file_path = 'Dataset .csv'
review_column = 'Rating text'  # column containing the text reviews
rating_column = 'Aggregate rating'  # column containing the ratings

# Compute the overall average length and the average length per rating
average_length, rating_length_relationship = analyze_review_length(
    csv_file_path=csv_file_path,
    review_column=review_column,
    rating_column=rating_column,
)

# Report the overall figure first, then the per-rating breakdown
print("Average Length of Reviews:", average_length)
print("\nRelationship between Review Length and Rating:", rating_length_relationship, sep="\n")


## Task 2 - Votes Analysis

### Identify the restaurants with the highest and lowest number of votes.

In [None]:
import pandas as pd

def identify_high_low_votes_restaurants(csv_file_path, name_column, votes_column):
    """Find the names of the rows with the most and the fewest votes.

    Args:
        csv_file_path: Path (or file-like object) passed to ``pd.read_csv``.
        name_column: Name of the column holding the restaurant name.
        votes_column: Name of the column holding the vote count.

    Returns:
        A ``(highest_votes_restaurant, lowest_votes_restaurant)`` tuple. When
        several rows share the extreme value, the first such row is used.
    """
    restaurants = pd.read_csv(csv_file_path)
    votes = restaurants[votes_column]

    def name_at(row_label):
        # Fetch the whole row first, then read the name out of it
        row = restaurants.loc[row_label]
        return row[name_column]

    most_voted = name_at(votes.idxmax())
    least_voted = name_at(votes.idxmin())

    return most_voted, least_voted

# Dataset location and the columns holding restaurant names and vote counts
csv_file_path = 'Dataset .csv'
name_column = 'Restaurant Name'
votes_column = 'Votes'

# Look up the restaurants at both ends of the vote-count range
highest_votes, lowest_votes = identify_high_low_votes_restaurants(
    csv_file_path=csv_file_path,
    name_column=name_column,
    votes_column=votes_column,
)

# Report both restaurants
print("Restaurant with the Highest Number of Votes:", highest_votes)
print("Restaurant with the Lowest Number of Votes:", lowest_votes)


### Analyze if there is a correlation between the number of votes and the rating of a restaurant.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_votes_rating_correlation(csv_file_path, votes_column, rating_column):
    """Correlate vote counts with ratings and show them in a scatter plot.

    Rows missing either value are dropped before anything is computed.

    Args:
        csv_file_path: Path passed to ``pd.read_csv``.
        votes_column: Name of the column holding the vote count.
        rating_column: Name of the column holding the rating.

    Returns:
        The correlation coefficient between the two columns, as computed by
        ``Series.corr`` with its default method.
    """
    complete_rows = pd.read_csv(csv_file_path).dropna(
        subset=[votes_column, rating_column]
    )
    votes = complete_rows[votes_column]
    ratings = complete_rows[rating_column]

    pearson_r = votes.corr(ratings)

    # Scatter plot of votes (x axis) against rating (y axis)
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=votes, y=ratings)
    plt.title('Number of Votes vs Rating')
    plt.xlabel('Number of Votes')
    plt.ylabel('Rating')
    plt.show()

    return pearson_r

# Dataset location and the two columns to correlate
csv_file_path = 'Dataset .csv'
votes_column = 'Votes'
rating_column = 'Aggregate rating'

# Compute the coefficient; the function also displays the scatter plot
correlation_coefficient = analyze_votes_rating_correlation(
    csv_file_path=csv_file_path,
    votes_column=votes_column,
    rating_column=rating_column,
)

# Report the coefficient
print("Correlation Coefficient between Votes and Rating:", correlation_coefficient)


## Task 3 - Price Range vs Online Delivery and Table Booking

### Analyze if there is a relationship between the price range and the availability of online delivery and table booking.

In [None]:
import pandas as pd

def analyze_price_range_relationship(csv_file_path, price_column, online_delivery_column, table_booking_column):
    """Compute, per price range, the share of rows offering each service.

    Args:
        csv_file_path: Path (or file-like object) passed to ``pd.read_csv``.
        price_column: Name of the column to group by.
        online_delivery_column: Name of the online-delivery flag column.
        table_booking_column: Name of the table-booking flag column.

    Returns:
        A DataFrame indexed by the values of ``price_column`` with one column
        per service, holding the percentage (0-100) of rows in that group
        whose flag equals the string ``'Yes'``.
    """
    def percent_yes(flags):
        # Rows equal to 'Yes' as a percentage of all rows in the group
        yes_count = (flags == 'Yes').sum()
        return yes_count / len(flags) * 100

    listings = pd.read_csv(csv_file_path)
    service_columns = (online_delivery_column, table_booking_column)
    aggregations = {column: percent_yes for column in service_columns}

    return listings.groupby(price_column).agg(aggregations)

# Example usage: dataset location and the columns involved
csv_file_path = 'Dataset .csv'
price_column = 'Price range'
online_delivery_column = 'Has Online delivery'
table_booking_column = 'Has Table booking'

# Compute the per-price-range percentages for both services
price_range_stats = analyze_price_range_relationship(
    csv_file_path=csv_file_path,
    price_column=price_column,
    online_delivery_column=online_delivery_column,
    table_booking_column=table_booking_column,
)

# Show the resulting table below its heading
print(
    "Percentage of Restaurants with Online Delivery and Table Booking by Price Range:",
    price_range_stats,
    sep="\n",
)


### Determine if higher-priced restaurants are more likely to offer these services.

In [None]:
import pandas as pd

def analyze_higher_priced_restaurants(csv_file_path, price_column, online_delivery_column, table_booking_column):
    """Compare service availability above and at-or-below the median price.

    Prices are coerced to numbers and rows whose price is missing or
    non-numeric are discarded. A row counts as higher-priced when its price
    is strictly greater than the median of the remaining prices.

    Args:
        csv_file_path: Path (or file-like object) passed to ``pd.read_csv``.
        price_column: Name of the price column.
        online_delivery_column: Name of the online-delivery flag column.
        table_booking_column: Name of the table-booking flag column.

    Returns:
        A DataFrame indexed by ``IsHigherPriced`` with one column per service,
        holding the fraction (0-1) of rows whose flag equals ``'Yes'``.
    """
    listings = pd.read_csv(csv_file_path)

    # Non-numeric prices become NaN and are removed together with missing ones
    numeric_price = pd.to_numeric(listings[price_column], errors='coerce')
    listings = listings.assign(**{price_column: numeric_price})
    listings = listings.dropna(subset=[price_column])

    median_price = listings[price_column].median()
    service_columns = (online_delivery_column, table_booking_column)

    # Add the price flag and turn the 'Yes' markers into booleans in one step
    derived = {'IsHigherPriced': listings[price_column] > median_price}
    for column in service_columns:
        derived[column] = listings[column] == 'Yes'
    flagged = listings.assign(**derived)

    # The mean of a boolean column is the share of True values in the group
    return flagged.groupby('IsHigherPriced').agg(
        {column: 'mean' for column in service_columns}
    )

# Dataset location and the columns involved
csv_file_path = 'Dataset .csv'
price_column = 'Price range'
online_delivery_column = 'Has Online delivery'
table_booking_column = 'Has Table booking'

# Compare service availability between higher-priced and other restaurants
average_availability = analyze_higher_priced_restaurants(
    csv_file_path=csv_file_path,
    price_column=price_column,
    online_delivery_column=online_delivery_column,
    table_booking_column=table_booking_column,
)

# Show the resulting table below its heading
print(
    "Average Availability of Online Delivery and Table Booking:",
    average_availability,
    sep="\n",
)
