In [None]:
import string
from collections import defaultdict

import folium
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from folium.plugins import HeatMap
from nltk.corpus import stopwords
from wordcloud import WordCloud

In [None]:
# Paths of the three input tables (restaurants, users, reviews); each one is
# loaded with pd.read_pickle further down.
# NOTE(review): ".gzip" is not an extension pandas infers compression from —
# confirm these are plain (uncompressed) pickles.
restaurant_file = "final_data_restaurant_yelp.gzip"
user_file = "final_data_user_yelp.gzip"
review_file = "final_data_review_yelp-001.gzip"

In [None]:
# Load the three tables in one pass. pd.read_pickle unpickles arbitrary Python
# objects, so these paths must only ever point at files from a trusted source.
restaurant_data, user_data, review_data = (
    pd.read_pickle(path) for path in (restaurant_file, user_file, review_file)
)

Fan Analysis

In [None]:
# The five users with the largest "fans" values
top_5_users_df = user_data.nlargest(n=5, columns="fans")
top_5_users_df

In [None]:
# Every review written by one of those five users
top_5_user_ids = top_5_users_df["user_id"].to_numpy()
written_by_top_5 = review_data["user_id"].isin(top_5_user_ids)
top_5_users_df1 = review_data.loc[written_by_top_5]
top_5_users_df1.head(1)

In [None]:
# Mean star rating over all restaurants (the "global average")
rest_global_avg = restaurant_data["stars"].mean()
rest_global_avg

In [None]:
# Fraction of the top-5 users' reviews whose rating falls on the same side of
# the global average as the reviewed restaurant's overall rating.
# Index the restaurant ratings by business_id once, instead of filtering
# restaurant_data twice for every review.
stars_by_business = restaurant_data.set_index("business_id")["stars"]
unmatched = ~top_5_users_df1["business_id"].isin(stars_by_business.index)
if unmatched.any():
    # Fail loudly rather than silently scoring unmatched reviews as disagreements.
    raise ValueError("a review references a business_id missing from restaurant_data")

# Plain arrays keep the comparison positional, independent of the review index.
user_stars = top_5_users_df1["stars"].to_numpy()
restaurant_stars = top_5_users_df1["business_id"].map(stars_by_business).to_numpy()

both_below = (user_stars < rest_global_avg) & (restaurant_stars < rest_global_avg)
both_above = (user_stars > rest_global_avg) & (restaurant_stars > rest_global_avg)
# 1 = user and restaurant agree relative to the global average, 0 = they do not
# (a rating exactly equal to the average counts as 0).
ratings = (both_below | both_above).astype(int).tolist()
# Guard the empty case so the cell shows NaN instead of dividing by zero.
sum(ratings) / len(ratings) if ratings else float("nan")  # Accuracy

Restaurant analysis for all states

In [None]:
# Move the index into the columns, then keep only the fields used below
kept_columns = ["business_id", "name", "city", "state",
                "latitude", "longitude", "stars", "review_count", "categories"]
restaurant_data = restaurant_data.reset_index()[kept_columns]
restaurant_data.head(1)

In [None]:
# Number of restaurant rows in each state
rest = restaurant_data.groupby("state").size()
rest

In [None]:
# Group every restaurant's coordinates by state and count the rows per state
lat = defaultdict(list)
lon = defaultdict(list)
ct = defaultdict(int)
coordinates = zip(restaurant_data["state"], restaurant_data["latitude"], restaurant_data["longitude"])
for state, latitude, longitude in coordinates:
    lat[state].append(latitude)
    lon[state].append(longitude)
    ct[state] += 1


In [None]:
# One point per state: [mean latitude, mean longitude, number of restaurants]
temp = [
    [sum(lat[state]) / len(lat[state]), sum(lon[state]) / len(lon[state]), count]
    for state, count in ct.items()
]

In [None]:
# Heat map of the per-state points in `temp`, each [latitude, longitude, weight].
# The map opens centred on Kansas, roughly the middle of the continental US.
# (folium and HeatMap are imported in the notebook's first cell.)
map_obj = folium.Map(location=[38.27312, -98.5821872], zoom_start=5)
HeatMap(temp).add_to(map_obj)

map_obj

In [None]:
# Plot Pie Chart
def plot_pie_chart(values, labels, title="", path=""):
    """Draw a pie chart on an 8x8-inch figure, optionally save it, then show it.

    Args:
        values: Wedge sizes; any iterable (dict views are accepted).
        labels: One label per wedge, in the same order as ``values``.
        title: Accepted for call-site compatibility; currently not drawn.
        path: If non-empty, the figure is saved to this path before it is shown.
    """
    # plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
    fig, ax = plt.subplots(figsize=(8, 8))
    # Materialise the inputs so dict views and generators behave like sequences.
    ax.pie(list(values), labels=list(labels))
    if path:
        fig.savefig(path, bbox_inches='tight')
    plt.show()

# Plot Bar Graph
def plot_bar_graph(values, labels, title="", xlabel="", ylabel="", angle=0, path=""):
    """Draw a vertical purple bar chart on an 8x5-inch figure, optionally save it, then show it.

    Args:
        values: Bar heights; any iterable (dict views are accepted).
        labels: One x-axis category per bar, in the same order as ``values``.
        title: Accepted for call-site compatibility; currently not drawn.
        xlabel: Text of the x-axis label.
        ylabel: Text of the y-axis label.
        angle: Rotation of the x tick labels, in degrees.
        path: If non-empty, the figure is saved to this path before it is shown.
    """
    # plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
    fig, ax = plt.subplots(figsize=(8, 5))
    # Materialise the inputs so dict views and generators behave like sequences.
    ax.bar(list(labels), list(values), color="purple")
    ax.set_xlabel(xlabel, fontsize=15)
    ax.set_ylabel(ylabel, fontsize=15)
    ax.tick_params(axis="x", labelrotation=angle)
    if path:
        fig.savefig(path, bbox_inches='tight')
    plt.show()

# Plot Horizontal Bar Graph
def plot_barh_graph(values, labels, title="", xlabel="", ylabel="", angle=0, path=""):
    """Draw a horizontal green bar chart on an 8x5-inch figure, optionally save it, then show it.

    Args:
        values: Bar lengths; any iterable (dict views are accepted).
        labels: One category per bar, in the same order as ``values``.
        title: Accepted for call-site compatibility; currently not drawn.
        xlabel: Label describing the categories; drawn on the y axis because
            the bars are horizontal.
        ylabel: Label describing the values; drawn on the x axis.
        angle: Accepted for signature parity with ``plot_bar_graph``; unused.
        path: If non-empty, the figure is saved to this path before it is shown.
    """
    # plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
    fig, ax = plt.subplots(figsize=(8, 5))
    # Materialise the inputs so dict views and generators behave like sequences.
    ax.barh(list(labels), list(values), color='#7eb54e')
    # Axis labels are deliberately swapped relative to the parameter names so
    # callers can pass the same (category, value) label order as plot_bar_graph.
    ax.set_ylabel(xlabel, fontsize=15)
    ax.set_xlabel(ylabel, fontsize=15)
    if path:
        fig.savefig(path, bbox_inches='tight')
    plt.show()

In [None]:
# Per-state aggregates: distinct restaurant ids, list of star ratings,
# and the summed review_count
restaurants_per_state = defaultdict(set)
rating_per_state = defaultdict(list)
reviews_per_state = defaultdict(int)

state_rows = zip(restaurant_data["state"], restaurant_data["business_id"],
                 restaurant_data["stars"], restaurant_data["review_count"])
for state, business_id, stars, review_count in state_rows:
    restaurants_per_state[state].add(business_id)
    rating_per_state[state].append(stars)
    reviews_per_state[state] += review_count

In [None]:
# These states have only 1 restaurant => Combine them together
for s in ["NC","CO","HI","MT","XMS"]:
    # set.update merges the popped ids one by one; set.add(<generator>) would
    # store the generator object itself instead of the ids. The pop default
    # keeps the cell safe to re-run once the states have already been merged.
    restaurants_per_state["Other"].update(restaurants_per_state.pop(s, set()))

num_rest = [len(ids) for ids in restaurants_per_state.values()]

In [None]:
# Share of restaurants in each state, as a pie chart
plot_pie_chart(
    values=num_rest,
    labels=restaurants_per_state.keys(),
    title="Percentage of Restaurants per State",
)

In [None]:
# Restaurant count for each state, as a bar chart
plot_bar_graph(
    values=num_rest,
    labels=restaurants_per_state.keys(),
    title="Number of Restaurants per State",
    xlabel="State",
    ylabel="No. of Restaurants",
)

In [None]:
# These states have only 1 restaurant and very few reviews => drop them
for s in ["NC","CO","HI","MT","XMS"]:
    # The default avoids a KeyError when the cell is re-run or a state is absent.
    reviews_per_state.pop(s, None)

In [None]:
# Total review count per state, used here as a measure of the state's popularity
plot_bar_graph(
    values=reviews_per_state.values(),
    labels=reviews_per_state.keys(),
    title="Popularity of States",
    xlabel="State",
    ylabel="No. of Reviews",
)

In [None]:
# Mean review count per restaurant, by state
avg_reviews_per_state = defaultdict(float)
for state, total_reviews in reviews_per_state.items():
    avg_reviews_per_state[state] = total_reviews / len(restaurants_per_state[state])
avg_reviews_per_state

In [None]:
# State popularity measured by the average review count per restaurant
plot_bar_graph(
    values=avg_reviews_per_state.values(),
    labels=avg_reviews_per_state.keys(),
    title="Popularity of States",
    xlabel="State",
    ylabel="Avg. No. of Reviews",
    path="rev_state.jpg",
)

Since California is the most popular state, we will proceed with the CA dataset.

Visualization for California State

In [None]:
# Restaurants located in California, renumbered from 0.
# reset_index(drop=True) discards the old index directly, so the cell does not
# depend on the old index surfacing as a column literally named "index".
ca_restaurant_data = restaurant_data.loc[restaurant_data["state"] == "CA"].reset_index(drop=True)
ca_restaurant_data.head(1)

In [None]:
# Total number of reviews in California (sum of review_count over CA restaurants)
ca_restaurant_data["review_count"].sum()

In [None]:
# Per-city aggregates for California, plus review totals keyed by restaurant
# name and the list of CA business ids
restaurants_per_city = defaultdict(set)
rating_per_city = defaultdict(list)
reviews_per_city = defaultdict(int)
reviews_per_restaurant = defaultdict(int)
ca_restaurants = []

ca_rows = zip(ca_restaurant_data["city"], ca_restaurant_data["business_id"],
              ca_restaurant_data["stars"], ca_restaurant_data["review_count"],
              ca_restaurant_data["name"])
for city, business_id, stars, review_count, name in ca_rows:
    restaurants_per_city[city].add(business_id)
    rating_per_city[city].append(stars)
    reviews_per_city[city] += review_count
    # Keyed by name, so rows that share a name pool their review counts.
    reviews_per_restaurant[name] += review_count
    ca_restaurants.append(business_id)

In [None]:
# Cities present: 'Santa Barbara', 'Isla Vista', 'Goleta', 'Carpinteria',
# 'Montecito', 'Summerland', 'Santa  Barbara', 'Truckee'
# "Santa  Barbara" (two spaces) is a misspelling of "Santa Barbara" => fold it in
for business_id in restaurants_per_city.pop("Santa  Barbara", ()):
    restaurants_per_city["Santa Barbara"].add(business_id)

# Truckee and Summerland have fewer than 10 restaurants => pool them as "Other"
for small_city in ("Summerland", "Truckee"):
    for business_id in restaurants_per_city.pop(small_city, ()):
        restaurants_per_city["Other"].add(business_id)

num_rest_per_city = [len(ids) for ids in restaurants_per_city.values()]

In [None]:
# Share of CA restaurants in each city, as a pie chart
plot_pie_chart(
    values=num_rest_per_city,
    labels=restaurants_per_city.keys(),
    title="Percentage of Restaurants per City in CA",
    path="res_city.jpg",
)

In [None]:
# Restaurant count for each CA city, as a bar chart
plot_bar_graph(
    values=num_rest_per_city,
    labels=restaurants_per_city.keys(),
    title="Number of Restaurants per City in CA",
    xlabel="City",
    ylabel="No. of Restaurants",
)

In [None]:
# Fold the misspelled "Santa  Barbara" (two spaces) review total into "Santa Barbara"
reviews_per_city["Santa Barbara"] += reviews_per_city.pop("Santa  Barbara", 0)

# Truckee and Summerland have fewer than 10 restaurants => pool them as "Other"
for small_city in ("Summerland", "Truckee"):
    reviews_per_city["Other"] += reviews_per_city.pop(small_city, 0)

In [None]:
# Total review count per CA city, used here as a measure of the city's popularity
plot_bar_graph(
    values=reviews_per_city.values(),
    labels=reviews_per_city.keys(),
    title="Popularity of Cities in CA",
    xlabel="City",
    ylabel="No. of Reviews",
)

In [None]:
# Mean review count per restaurant, by CA city
avg_reviews_per_city = defaultdict(float)
for city, total_reviews in reviews_per_city.items():
    avg_reviews_per_city[city] = total_reviews / len(restaurants_per_city[city])
avg_reviews_per_city

In [None]:
# City popularity measured by the average review count per restaurant
plot_bar_graph(
    values=avg_reviews_per_city.values(),
    labels=avg_reviews_per_city.keys(),
    title="Popularity of Cities",
    xlabel="City",
    ylabel="Avg. No. of Reviews",
)

In [None]:
# Ten CA restaurant names with the most reviews, kept in ascending order
by_review_count = sorted(reviews_per_restaurant.items(), key=lambda item: item[1])
top_10_restaurants = dict(by_review_count[-10:])

In [None]:
# Review counts of the ten most-reviewed CA restaurants, as a pie chart
plot_pie_chart(
    values=top_10_restaurants.values(),
    labels=top_10_restaurants.keys(),
    title="Popularity of Restaurants in CA",
)

In [None]:
# Review counts of the ten most-reviewed CA restaurants, as horizontal bars
plot_barh_graph(
    values=top_10_restaurants.values(),
    labels=list(top_10_restaurants.keys()),
    title="Popularity of Restaurants in CA",
    xlabel="Restaurants",
    ylabel="No. of Reviews",
    angle=90,
    path="rev_rest.jpg",
)

Top Categories in CA

In [None]:
# Count CA restaurants per category and collect the star ratings seen in each
categories = defaultdict(int)
ratings_per_category = defaultdict(list)
for raw_categories, stars in zip(ca_restaurant_data["categories"], ca_restaurant_data["stars"]):
    # A missing categories value (None/NaN) has nothing to split.
    if not isinstance(raw_categories, str):
        continue
    for cat in raw_categories.split(","):
        cat = cat.strip()
        # Skip blanks left behind by stray or trailing commas.
        if not cat:
            continue
        categories[cat] += 1
        ratings_per_category[cat].append(stars)

len(categories)

In [None]:
# Ten most frequent categories AFTER skipping the two most frequent ones:
# the slice [-12:-2] of the ascending sort leaves out the last two entries.
# NOTE(review): presumably the skipped pair are generic tags — confirm.
categories_by_frequency = sorted(categories.items(), key=lambda item: item[1])
top_10_categories = dict(categories_by_frequency[-12:-2])
top_10_categories

In [None]:
# Frequencies of the selected ten categories, as a pie chart
plot_pie_chart(
    values=top_10_categories.values(),
    labels=top_10_categories.keys(),
    title="Top 10 Categories",
)

In [None]:
# Frequencies of the selected ten categories, as a bar chart
plot_bar_graph(
    values=top_10_categories.values(),
    labels=top_10_categories.keys(),
    title="Top 10 Categories",
    xlabel="Category",
    ylabel="Frequency",
    angle=90,
    path="pop_cat.jpg",
)

In [None]:
# Mean star rating of each of the selected ten categories
avg_rating_per_category = defaultdict(float)
for category in top_10_categories:
    category_ratings = ratings_per_category[category]
    avg_rating_per_category[category] = sum(category_ratings) / len(category_ratings)
avg_rating_per_category

In [None]:
# Average rating of each selected category, as a bar chart
plot_bar_graph(
    values=avg_rating_per_category.values(),
    labels=avg_rating_per_category.keys(),
    title="Top 10 Categories",
    xlabel="Category",
    ylabel="Avg. Rating",
    angle=90,
)

Analysis for Review Dataset

In [None]:
# Move the index into the columns, then keep only the review fields used below
review_columns = ['review_id', 'user_id', 'business_id', 'stars', 'text']
review_data = review_data.reset_index()[review_columns]
review_data.head(1)

In [None]:
# Reviews of CA restaurants only, renumbered from 0.
# reset_index(drop=True) discards the old index directly, so the cell does not
# depend on the old index surfacing as a column literally named "index".
ca_review_data = review_data.loc[review_data["business_id"].isin(ca_restaurants)].reset_index(drop=True)
ca_review_data.head(1)

Reviews per User

In [None]:
# Number of CA reviews written by each user
rev_per_user = ca_review_data.groupby("user_id").size().reset_index(name="count")

# For each review count k, how many users wrote exactly k reviews
rev_per_user.groupby("count").size()

Word Analysis

In [None]:
# English stop words from NLTK, as a set for fast membership tests.
# Needs the NLTK "stopwords" corpus to be available locally.
stop_words = set(stopwords.words("english"))

In [None]:
# Count how often each non-stop word occurs in the CA reviews and collect the
# star ratings of the reviews it appears in
word_count = defaultdict(int)
ratings_per_word = defaultdict(list)

# Build the punctuation-stripping table once rather than once per review.
strip_punctuation = str.maketrans('', '', string.punctuation)
# Tokens lose their punctuation before the stop-word check, so strip the stop
# words the same way; otherwise "don't" never matches the token "dont".
normalized_stop_words = {stop_word.translate(strip_punctuation) for stop_word in stop_words}

for text, rating in zip(ca_review_data["text"], ca_review_data["stars"]):
    # split() with no argument splits on any run of whitespace, so the words on
    # either side of a line break stay separate and no empty tokens appear.
    for word in text.translate(strip_punctuation).lower().split():
        if word in normalized_stop_words:
            continue
        word_count[word] += 1
        ratings_per_word[word].append(rating)

len(word_count)

In [None]:
# Fifteen most frequent words, kept in ascending order of frequency
words_by_frequency = sorted(word_count.items(), key=lambda item: item[1])
top_15_words = dict(words_by_frequency[-15:])
top_15_words

In [None]:
# Frequencies of the fifteen most used words, as a bar chart
plot_bar_graph(
    values=top_15_words.values(),
    labels=top_15_words.keys(),
    title="Top 15 Words",
    xlabel="Word",
    ylabel="Frequency",
    angle=90,
)

In [None]:
# Mean star rating of the reviews containing each of the top 15 words
avg_rating_per_word = defaultdict(float, {
    word: sum(ratings_per_word[word]) / len(ratings_per_word[word])
    for word in top_15_words
})
avg_rating_per_word

In [None]:
# Average rating associated with each of the top 15 words, as a bar chart
plot_bar_graph(
    values=avg_rating_per_word.values(),
    labels=avg_rating_per_word.keys(),
    title="Top 15 Words",
    xlabel="Word",
    ylabel="Avg. Rating",
    angle=90,
)

In [None]:
# Mean star rating for every counted word (replaces the top-15-only mapping)
avg_rating_per_word = defaultdict(float, {
    word: sum(word_ratings) / len(word_ratings)
    for word, word_ratings in ratings_per_word.items()
})
avg_rating_per_word

In [None]:
# Fifteen words with the highest average rating, in ascending order.
# Every word counts equally here, however rarely it occurs.
words_by_avg_rating = sorted(avg_rating_per_word.items(), key=lambda item: item[1])
top_15_words = dict(words_by_avg_rating[-15:])
top_15_words

In [None]:
# Fifteen words with the lowest average rating, in ascending order
words_by_avg_rating = sorted(avg_rating_per_word.items(), key=lambda item: item[1])
bottom_15_words = dict(words_by_avg_rating[:15])
bottom_15_words

Word Cloud

In [None]:
# Word cloud built from the word frequencies, on a white background
wordcloud = WordCloud(background_color='white').generate_from_frequencies(frequencies=word_count)

# Render the cloud as an image with the axes hidden
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Map Visualization

In [None]:
# Map centre: the mean latitude and longitude of the CA restaurants
center_lat = ca_restaurant_data["latitude"].mean()
center_lon = ca_restaurant_data["longitude"].mean()
center_lat, center_lon

In [None]:
# Hover text for each marker: "<name>, <city>, <state>, Rating: <stars>".
# The name and city need a ", " separator, otherwise they run together.
ca_restaurant_data['text'] = (
    ca_restaurant_data['name'] + ', ' + ca_restaurant_data['city'] + ', '
    + ca_restaurant_data['state'] + ', ' + 'Rating: ' + ca_restaurant_data['stars'].astype(str)
)

# One marker per CA restaurant, coloured by its star rating
fig = go.Figure(data=go.Scattergeo(
        lon = ca_restaurant_data['longitude'],
        lat = ca_restaurant_data['latitude'],
        text = ca_restaurant_data['text'],
        mode = 'markers',
        marker_color = ca_restaurant_data['stars'],
        ))

# Single layout update: US Albers projection, zoomed in on the mean restaurant
# location computed in the previous cell.
fig.update_layout(
        title_text = "Restaurant ratings in CA<br>(Click legend to toggle traces)",
        geo = dict(
            scope='usa',
            projection_type='albers usa',
            showland = True,
            projection_scale=70, # acts like a zoom factor
            center=dict(lat=center_lat, lon=center_lon), # centre the view on this point
        ))
fig.write_html("restaurants.html")

In [None]:
# geojson = px.data.election_geojson()

# fig = px.choropleth_mapbox(ca_restaurant_data, geojson=geojson,
#                            locations="city", center={"lat": center_lat, "lon": center_lon},
#                            mapbox_style="carto-positron", zoom=9)
# fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# fig.write_html("restaurants.html")
# # fig.show()