A brief data analysis of some of the worst reviews Yelp has to offer 

In [None]:
# Packages to install (one-time environment setup for this notebook).
# The %conda / %pip magics install into the environment of the running kernel.
# NOTE(review): versions are not pinned, so a fresh install may pull newer releases.
%conda install -c conda-forge wordcloud
%pip install folium
%pip install langdetect

In [None]:
#Packages to import and why
import html                                 #escape review text before embedding it in map popups
import logging                              #best practices make best perfects

import folium                               #maps
import matplotlib.pyplot as plt             #data visualization
import nltk                                 #language processing
import numpy as np                          #math stuff
import pandas as pd                         #data processing
import seaborn as sns                       #also data visualization
from langdetect import DetectorFactory, detect_langs  #get language of reviews bc German is too negative
from nltk.sentiment.vader import SentimentIntensityAnalyzer  #sentiment analysis of text
from wordcloud import WordCloud, STOPWORDS  #wordclouds and useless words

# Download the 'vader_lexicon' resource for the VADER sentiment analyzer imported above.
nltk.download('vader_lexicon')
# Show matplotlib figures inline in the notebook output.
%matplotlib inline 

In [None]:
# Logging setup: records at WARNING level and above are written to logfile.log,
# which is opened in "w" mode (truncated every time this cell configures logging).
Log_format = "%(levelname)s %(asctime)s - %(message)s"

logging.basicConfig(level=logging.WARNING, format=Log_format, filename="logfile.log", filemode="w")

# Root logger, shared by the rest of the notebook.
logger = logging.getLogger()

# Write a first entry so the log file visibly works.
logger.error("Let's GO")

In [None]:
# Load the business table from the Google Sheets CSV export endpoint.
BUSINESS_CSV_URL = 'https://docs.google.com/spreadsheets/d/12rEscRSc6wu4tcK_n__fIq_AjcQ1IRqjHZ6FhO8REsA/gviz/tq?tqx=out:csv'
business = pd.read_csv(BUSINESS_CSV_URL, index_col=False)


In [None]:
# Preview the first five rows of the raw business table.
business.head(5)

In [None]:
# Keep only the business columns that later cells use.
wanted_columns = [
    'business_id',
    'name',
    'address',
    'state',
    'postal_code',
    'latitude',
    'longitude',
    'is_open',
    'categories',
]
business_cols = business[wanted_columns]

In [None]:
# Keep only businesses whose `categories` text contains 'Restaurant'.
# na=False makes rows with a missing `categories` value count as non-matches; without it
# the mask would contain NaN for those rows and boolean indexing would raise.
bus_rests = business_cols[business_cols['categories'].str.contains('Restaurant', na=False)]

In [None]:
# Number of restaurant rows after the category filter.
bus_rests.shape[0]

In [None]:
# Keep only restaurants that are flagged as open (is_open == 1).
open_rest = bus_rests.loc[bus_rests['is_open'].eq(1)]

In [None]:
# Number of restaurants that are still open.
open_rest.shape[0]

In [None]:
# The filter columns have served their purpose: remove them and renumber rows from zero.
open_rest = (
    open_rest
    .drop(['is_open', 'categories'], axis=1)
    .reset_index(drop=True)
)

In [None]:
# Preview the cleaned table of open restaurants.
open_rest.head(5)

Loading and filtering the review data

In [None]:
# Load the review table from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted to run elsewhere.
REVIEW_CSV_PATH = '/Users/tifanyables/Desktop/Welp/yelp_dataset/yelp_review.csv'
review = pd.read_csv(REVIEW_CSV_PATH)

In [None]:
# Total number of reviews loaded.
review.shape[0]

In [None]:
# Discard the review columns that later cells never read (the index is left untouched).
review = review.drop(
    ['review_id', 'user_id', 'useful', 'funny', 'cool', 'date'],
    axis=1,
)

In [None]:
# Set of business ids for open restaurants, used for fast membership tests below.
rest_ids = set(open_rest['business_id'].tolist())

In [None]:
# Keep only the reviews whose business_id belongs to an open restaurant.
rest_reviews = review.loc[review['business_id'].isin(rest_ids)]

In [None]:
# Preview the restaurant-only reviews.
rest_reviews.head(5)

In [None]:
# How many distinct restaurants have at least one review (missing ids are not counted).
rest_reviews['business_id'].nunique(dropna=True)

In [None]:
# Number of reviews that belong to open restaurants.
rest_reviews.shape[0]

In [None]:
# Number of restaurant reviews at each star rating, as a two-column table (stars, count).
star_count = (
    rest_reviews
    .groupby('stars')
    .size()
    .rename('count')
    .reset_index()
)
star_count

In [None]:
# Bar chart of how many restaurant reviews were given each star rating.
ax = sns.countplot(x='stars', data=rest_reviews, palette="pastel")
ax.set_title('Star Ratings of Restaurant Reviews', fontsize=18, pad=25.0)
ax.set_xlabel("Star Rating", fontsize=15)
ax.set_ylabel("Count", fontsize=15)
# Plain tick labels on the y axis: no scientific notation, no offset.
ax.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.show()

In [None]:
# Index both tables by business_id so they can be joined on it in the merge below.
# The result is assigned instead of using inplace=True, so the filtered slice of `review`
# is not mutated in place. The guards make the cell safe to re-run: once business_id is the
# index it is no longer a column, and a second set_index call would raise KeyError.
if 'business_id' in rest_reviews.columns:
    rest_reviews = rest_reviews.set_index('business_id')
if 'business_id' in open_rest.columns:
    open_rest = open_rest.set_index('business_id')

In [None]:
# Keep only the reviews that gave exactly one star.
one_stars = rest_reviews[rest_reviews['stars'].eq(1)]

One-star reviews for open restaurants, with business info attached

In [None]:
# Join each one-star review to its restaurant's details using the shared business_id.
one_star_reviews = pd.merge(open_rest, one_stars, on="business_id")

In [None]:
# Preview the merged one-star review table.
one_star_reviews.head(5)

Sentiment Analysis 

In [None]:
# Score every one-star review with the sentiment analyzer and keep its 'compound' score,
# redrawing a 100-character text progress bar in place while the loop runs.
analyzer = SentimentIntensityAnalyzer()

# Reviews per progress-bar tick. Clamped to at least 1 so a frame with fewer than
# 100 rows does not cause a division by zero inside the loop.
hundredth = max(1, len(one_star_reviews) // 100)

compound_scores = []

for i, review_text in enumerate(one_star_reviews['text']):
    compound_scores.append(analyzer.polarity_scores(review_text)['compound'])
    # Cap at 99 ticks so the bar keeps a fixed width when the row count
    # is not an exact multiple of 100.
    ticks = min(i // hundredth, 99)
    # end='\r' returns to the start of the line so the next print overwrites the bar.
    print(ticks * '|' + (99 - ticks) * ' ' + '|', end='\r')

# Add the scores as a new column, in the same row order as they were computed.
one_star_reviews['compound'] = compound_scores

In [None]:
# Order the reviews by compound score, lowest (most negative) first.
one_star_reviews = one_star_reviews.sort_values('compound', ascending=True)

In [None]:
# Average compound score over all one-star reviews.
compound_mean = one_star_reviews.loc[:, 'compound'].mean()
print(compound_mean)

Language detection 

In [None]:
# Take the first 700 rows of the sorted table as candidates for the language filter.
N_CANDIDATES = 700
worst_reviews = one_star_reviews.head(N_CANDIDATES)

In [None]:
# Label each candidate review with its detected language code.
# langdetect is non-deterministic unless seeded; fixing the seed before the first
# detection makes the labels (and everything filtered on them) reproducible.
DetectorFactory.seed = 0

# One list of language guesses per review.
language = [detect_langs(text) for text in worst_reviews['text']]
# Keep the code of the first guess; an empty guess list becomes 'unknown' instead of
# raising IndexError.
languages = [guesses[0].lang if guesses else 'unknown' for guesses in language]
# assign() builds a new frame, so no column is written onto the head() slice
# (which would trigger SettingWithCopyWarning).
worst_reviews = worst_reviews.assign(language=languages)

In [None]:
# Drop every review whose detected language is not English ('en').
worst_reviews = worst_reviews.loc[worst_reviews['language'].eq('en')]

In [None]:
# Number of candidate reviews left after the English-only filter.
worst_reviews.shape[0]

Worst 500 reviews by compound score

In [None]:
# Keep at most the first 500 English reviews (the table is still in compound-score order).
N_WORST = 500
worst_reviews = worst_reviews.head(N_WORST)

In [None]:
# Preview the final set of worst reviews.
worst_reviews.head(5)

In [None]:
# Average compound score of the selected worst reviews.
worst_reviews_mean = worst_reviews.loc[:, 'compound'].mean()
print(worst_reviews_mean)

Map making

In [None]:
# Make a basemap centred on the USA at a country-level zoom.
# "Stamen Toner" is not among the tile sets bundled with current folium releases, where
# folium.Map rejects it as a custom tile set without attribution. "CartoDB positron" is a
# built-in light, low-contrast basemap accepted by both older and newer folium versions.
USA = [37.090240, -95.712891]
Map = folium.Map(USA, zoom_start=4, tiles="CartoDB positron")

In [None]:
# Add one marker per review: the tooltip shows the restaurant name and the popup shows
# the name plus the full review text.

# folium cannot place a marker at NaN coordinates, so rows without a location are skipped.
mappable = worst_reviews.dropna(subset=['latitude', 'longitude'])

marker_rows = zip(
    mappable['name'],
    mappable['text'],
    mappable['latitude'],
    mappable['longitude'],
)
for raw_name, raw_text, lat, lon in marker_rows:
    # Presumably names arrive wrapped in literal double quotes (the original code dropped
    # the first and last character). strip('"') removes only quote characters, so an
    # unquoted name is left intact. TODO confirm against the business data.
    # Review text and names are user-supplied, so both are HTML-escaped before being
    # embedded in the popup/tooltip markup.
    name = html.escape(str(raw_name).strip('"'))
    body = html.escape(str(raw_text))
    msg = f'''
    <center><h2>{name}</h2></center>
    <p>{body}</p>
    '''
    iframe = folium.IFrame(html=msg, width=400, height=300)
    folium.Marker(
        [lat, lon],
        tooltip=name,
        popup=folium.Popup(iframe, max_width=400)
    ).add_to(Map)

# Display the finished map.
Map

In [None]:
# Number of reviews in the final worst-review set.
worst_reviews.shape[0]

In [None]:
# Average compound score of the English worst reviews.
worst_mean_english = worst_reviews.loc[:, 'compound'].mean()
print(worst_mean_english)

Word clouding 

In [None]:
# Join all review texts into one lower-cased string: the raw material for the word cloud.
review_words = " ".join(worst_reviews['text']).lower()

# How many words? split() with no argument splits on any run of whitespace, so words
# separated by newlines are counted separately and repeated spaces add no empty "words".
word_total = len(review_words.split())
# Report the actual number of reviews rather than assuming there are exactly 500.
print(f"There are {word_total} words in the combination of the {len(worst_reviews)} worst reviews.")
# How many stopwords (very common words the word cloud will ignore)?
print("There are {} stopwords.".format(len(STOPWORDS)))

In [None]:
# Add extra common filler words to wordcloud's STOPWORDS set (updated in place).
extra_stopwords = [
    "will", "let", "well", "u", "us", "another",
    "go", "got", "came", "come", "put", "going",
]
STOPWORDS.update(extra_stopwords)
print("Now there are {} stopwords.".format(len(STOPWORDS)))

In [None]:
# Build a word cloud from the combined review text, leaving out the stopwords.
# random_state is fixed so the layout is the same on every run.
another_word_cloud = WordCloud(
    width=2500,
    height=2000,
    max_words=200,
    stopwords=STOPWORDS,
    random_state=42,
).generate(review_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(12, 10), dpi=80)
plt.imshow(another_word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()