In [None]:
import string
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from nltk import bigrams, ngrams, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

In [None]:
# Paths of the three pickled Yelp tables (restaurants, users, reviews) that are
# loaded with pd.read_pickle. They are relative, so they resolve against the
# notebook's working directory.
restaurant_file = "final_data_restaurant_yelp.gzip"
user_file = "final_data_user_yelp.gzip"
review_file = "final_data_review_yelp-001.gzip"

In [None]:
# Load the restaurant, user and review tables from their pickle files.
# NOTE(review): unpickling can execute arbitrary code; only run this on files
# from a trusted source.
restaurant_data, user_data, review_data = [
    pd.read_pickle(pickle_path)
    for pickle_path in (restaurant_file, user_file, review_file)
]

4) Scatter plot for number of fans of a user vs average rating

In [None]:
# Mean star rating over all reviews written by each user, one row per user_id.
avg_rating_df = (
    review_data
    .loc[:, ["user_id", "stars"]]
    .groupby("user_id")
    .mean()
    .reset_index()
)
avg_rating_df.head()

In [None]:
# Attach each user's "useful" and "fans" columns to their average rating.
# The default (inner) merge keeps only users present in both tables.
user_stats = user_data[["user_id", "useful", "fans"]]
merged_df = pd.merge(avg_rating_df, user_stats, on="user_id")
merged_df.head()

In [None]:
# Scatter of each user's average rating (x) against their fan count (y).
# The fan counts are divided by 1000 before plotting, so the axis label names
# that unit explicitly instead of implying raw counts.
fans_in_thousands = merged_df["fans"].to_numpy() / 1000
plt.scatter(merged_df["stars"].to_numpy(), fans_in_thousands)
plt.ylabel("Number of fans (thousands)")
plt.xlabel("Average Rating")
plt.show()

In [None]:
# Mean star rating over all reviews of each restaurant, one row per business_id.
# This rebinds avg_rating_df, which until now held the per-user averages.
avg_rating_df = (
    review_data
    .loc[:, ["business_id", "stars"]]
    .groupby("business_id")
    .mean()
    .reset_index()
)
avg_rating_df.head()

In [None]:
# Total fan count of the reviewers of each restaurant, next to its average rating.
# Every (business, reviewer) row from review_data is joined to that reviewer's
# fan count, then the rows are collapsed back to one per business.
# NOTE(review): a user with several reviews of the same business contributes
# their fan count once per review row - confirm that is intended.
merged_df = (
    avg_rating_df
    .merge(review_data[["business_id", "user_id"]], on="business_id")
    .merge(user_data[["user_id", "fans"]], on="user_id")
    .groupby("business_id")
    .agg(stars=("stars", "first"), fans=("fans", "sum"))
    .reset_index()
)
merged_df.head()

In [None]:
# Per-restaurant scatter: summed reviewer fans (x) vs average rating (y).
# The figure is written to fan_analysis.jpg before being displayed.
fig, ax = plt.subplots()
ax.scatter(merged_df["fans"].to_numpy(), merged_df["stars"].to_numpy())
ax.set_xlabel("Number of fans", fontsize=15)
ax.set_ylabel("Average Rating", fontsize=15)
ax.tick_params(axis="both", labelsize=10)
fig.savefig("fan_analysis.jpg")
plt.show()

5) Scatter plot for number of useful reviews of a user vs average rating

In [None]:
# Total "useful" count of the reviewers of each restaurant, next to its average rating.
# Every (business, reviewer) row from review_data is joined to that reviewer's
# "useful" value, then the rows are collapsed back to one per business.
# NOTE(review): a user with several reviews of the same business contributes
# their "useful" value once per review row - confirm that is intended.
merged_df = (
    avg_rating_df
    .merge(review_data[["business_id", "user_id"]], on="business_id")
    .merge(user_data[["user_id", "useful"]], on="user_id")
    .groupby("business_id")
    .agg(stars=("stars", "first"), useful=("useful", "sum"))
    .reset_index()
)
merged_df.head()

In [None]:
# Per-restaurant scatter: summed reviewer "useful" counts (x) vs average rating (y).
# The figure is written to useful_reviews.jpg before being displayed.
fig, ax = plt.subplots()
ax.scatter(merged_df["useful"].to_numpy(), merged_df["stars"].to_numpy())
ax.set_xlabel("Number of Useful Reviews", fontsize=15)
ax.set_ylabel("Average Rating", fontsize=15)
ax.tick_params(axis="both", labelsize=10)
fig.savefig("useful_reviews.jpg")
plt.show()

2) Table for data stats - All states, Just CA
	Number of Reviews
	Number of users
	Number of businesses
	Number of categories

In [None]:
# (rows, columns) of the restaurant, user and review tables, in that order.
tuple(table.shape for table in (restaurant_data, user_data, review_data))

In [None]:
# Number of distinct category labels across all restaurants.
# Each "categories" value is a ", "-separated string; split it into a list and
# explode so that every label gets its own row before counting unique values.
# A missing categories value, if any, is counted as one entry by unique().
categories_df = (
    restaurant_data[["categories"]]
    .assign(categories=lambda frame: frame["categories"].str.split(", "))
    .explode("categories")
)
len(categories_df["categories"].unique())

In [None]:
# Keep only the restaurants whose state is "CA" and show the subset's shape.
is_california = restaurant_data["state"].eq("CA")
ca_rest = restaurant_data[is_california]
ca_rest.shape

In [None]:
# Join every review onto its CA restaurant (inner join on business_id), giving
# one row per review of a CA restaurant. This rebinds merged_df.
# NOTE(review): columns present in both tables other than business_id would get
# pandas' default _x/_y suffixes - check the head() output below.
merged_df = pd.merge(ca_rest, review_data, on="business_id")
merged_df.head()

In [None]:
# (rows, columns) of the CA restaurant/review join; with one row per joined
# review, the row count is the number of reviews of CA restaurants.
merged_df.shape

In [None]:
# Number of distinct users who reviewed a CA restaurant.
merged_df["user_id"].unique().size

In [None]:
# Number of distinct category labels among the CA restaurants in merged_df.
# The frame is rebuilt from merged_df itself. Assigning merged_df's column into
# the previously exploded all-states categories_df would align rows on unrelated
# index labels and produce a meaningless count.
categories_df = merged_df[["categories"]].copy()
categories_df["categories"] = categories_df["categories"].str.split(", ")
categories_df = categories_df.explode("categories")
# A missing categories value, if any, is counted as one entry by unique().
len(categories_df["categories"].unique())

1) Word cloud (unigram/bigram/trigram) - Top positive, top negative

In [None]:
# Score every review with VADER and store the compound score on merged_df.
# NOTE(review): assumes the NLTK "vader_lexicon" resource is already installed.
sid = SentimentIntensityAnalyzer()
merged_df['compound'] = [
    sid.polarity_scores(review)['compound'] for review in merged_df['text']
]

# Two-way label: a compound score of 0 or more (including exactly 0) is
# 'positive', anything below 0 is 'negative'.
merged_df['sentiment'] = np.where(merged_df['compound'] >= 0, 'positive', 'negative')

# Concatenate the review texts of each sentiment group into one long string.
is_positive = merged_df['sentiment'] == 'positive'
is_negative = merged_df['sentiment'] == 'negative'
positive_reviews = ' '.join(merged_df.loc[is_positive, 'text'])
negative_reviews = ' '.join(merged_df.loc[is_negative, 'text'])

# Word cloud of the positive reviews, saved to pos_uni.jpg and then displayed.
wordcloud_positive = WordCloud(width=800, height=400, background_color='white').generate(positive_reviews)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_positive, interpolation='bilinear')
ax.set_title('WordCloud for Positive Reviews', fontsize=20)
ax.axis('off')
fig.savefig("pos_uni.jpg")
plt.show()

In [None]:
# Word cloud of the concatenated negative reviews, saved to neg_uni.jpg and
# then displayed.
wordcloud_negative = WordCloud(width=800, height=400, background_color='white').generate(negative_reviews)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_negative, interpolation='bilinear')
ax.set_title('WordCloud for Negative Reviews', fontsize=20)
ax.axis('off')
fig.savefig("neg_uni.jpg")
plt.show()

In [None]:
import re

# Replace every run of non-ASCII characters in a review with a single space.
non_ascii_run = re.compile(r'[^\x00-\x7F]+')
merged_df['cleaned_text'] = merged_df['text'].map(lambda review: non_ascii_run.sub(' ', review))

# Rebuild the per-sentiment strings from the cleaned text; this rebinds
# positive_reviews / negative_reviews.
is_positive = merged_df['sentiment'] == 'positive'
is_negative = merged_df['sentiment'] == 'negative'
positive_reviews = ' '.join(merged_df.loc[is_positive, 'cleaned_text'])
negative_reviews = ' '.join(merged_df.loc[is_negative, 'cleaned_text'])

In [None]:
# Tokenize the joined review strings and collect every pair of adjacent tokens.
# word_tokenize takes a string, so the text is passed directly; passing the list
# returned by .split() raises a TypeError.
positive_reviews_bigrams = list(bigrams(word_tokenize(positive_reviews)))
negative_reviews_bigrams = list(bigrams(word_tokenize(negative_reviews)))

# generate_from_frequencies needs a {phrase: count} mapping. dict() over the
# pairs would instead map each first token to one of its following tokens.
positive_bigram_counts = Counter(' '.join(pair) for pair in positive_reviews_bigrams)

# Bigram word cloud for the positive reviews, saved to pos_bi.jpg and then displayed.
wordcloud_positive_bigrams = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(positive_bigram_counts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_positive_bigrams, interpolation='bilinear')
plt.title('Bigram WordCloud for Positive Reviews')
plt.axis('off')
plt.savefig("pos_bi.jpg")
plt.show()

In [None]:
# Bigram word cloud for the negative reviews, saved to neg_bi.jpg and then displayed.
# negative_reviews_bigrams is a list of (token, token) pairs. Each pair is joined
# into a phrase and counted, because generate_from_frequencies needs a
# {phrase: count} mapping; dict() over the pairs would map token to token.
negative_bigram_counts = Counter(' '.join(pair) for pair in negative_reviews_bigrams)

wordcloud_negative_bigrams = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(negative_bigram_counts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_negative_bigrams, interpolation='bilinear')
plt.title('Bigram WordCloud for Negative Reviews')
plt.axis('off')
plt.savefig("neg_bi.jpg")
plt.show()

In [None]:
# Split the joined raw review text of each sentiment group on whitespace and
# collect every run of three consecutive words.
positive_text = ' '.join(merged_df.loc[merged_df['sentiment'] == 'positive', 'text'])
negative_text = ' '.join(merged_df.loc[merged_df['sentiment'] == 'negative', 'text'])
positive_reviews_trigrams = list(ngrams(positive_text.split(), 3))
negative_reviews_trigrams = list(ngrams(negative_text.split(), 3))

# generate_from_frequencies needs a {phrase: count} mapping. dict() cannot be
# built from 3-tuples (it raises ValueError), so join and count them instead.
positive_trigram_counts = Counter(' '.join(triple) for triple in positive_reviews_trigrams)

# Trigram word cloud for the positive reviews, saved to pos_tri.jpg and then displayed.
wordcloud_positive_trigrams = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(positive_trigram_counts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_positive_trigrams, interpolation='bilinear')
plt.title('Trigram WordCloud for Positive Reviews')
plt.axis('off')
plt.savefig("pos_tri.jpg")
plt.show()

In [None]:
# Trigram word cloud for the negative reviews, saved to neg_tri.jpg and then displayed.
# negative_reviews_trigrams is a list of 3-word tuples. Each tuple is joined into
# a phrase and counted, because generate_from_frequencies needs a {phrase: count}
# mapping and dict() cannot be built from 3-tuples (it raises ValueError).
negative_trigram_counts = Counter(' '.join(triple) for triple in negative_reviews_trigrams)

wordcloud_negative_trigrams = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(negative_trigram_counts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_negative_trigrams, interpolation='bilinear')
plt.title('Trigram WordCloud for Negative Reviews')
plt.axis('off')
plt.savefig("neg_tri.jpg")
plt.show()