<a href="https://colab.research.google.com/github/Roshan-Velpula/Sentiment-Analysis-Yelp-Business-Reviews/blob/main/Yelp_Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis for Business Reviews in Yelp

In [None]:
import requests

In [None]:
from bs4 import BeautifulSoup

### Web scraping using BeautifulSoup - Yelp.com

In [None]:
# Fetch the first page of reviews. requests has no default timeout, so set one
# explicitly to keep a stalled connection from hanging the notebook.
r = requests.get('https://www.yelp.com/biz/l-as-du-fallafel-paris?osq=Restaurants', timeout=30)

In [None]:
r.status_code

200

In [None]:
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Select the English review bodies on the first page. `find_all` is the
# supported spelling; `findAll` is a deprecated BeautifulSoup 3 alias.
divs = soup.find_all(lang="en", class_="raw__09f24__T4Ezm")

In [None]:
# Collect the plain text of every review element found on the first page.
reviews = [div.get_text() for div in divs]

In [None]:
current_page = soup.find('div', class_='pagination-link--current__09f24__vBjKh')

In [None]:
# Scrape the remaining review pages with the while loop below. The next page's URL follows a fixed pattern: the `start` query parameter is the current page number multiplied by 10.

In [None]:
# Walk the remaining review pages until the pagination marker disappears or a
# request fails.
while current_page:
    # The text of the current-page pagination element is parsed as the page number.
    current_page_number = int(current_page.get_text())

    # The next page is addressed by a `start` offset of page_number * 10.
    url = f'https://www.yelp.com/biz/l-as-du-fallafel-paris?osq=Restaurants&start={current_page_number * 10}'
    # Explicit timeout: requests would otherwise wait indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Compare by value; `is not 200` tests object identity and only works by
    # accident of CPython's small-integer caching (and triggers a SyntaxWarning).
    if response.status_code != 200:
        break

    soup = BeautifulSoup(response.text, 'html.parser')
    review_elements = soup.find_all(lang="en", class_="raw__09f24__T4Ezm")
    reviews.extend(element.get_text() for element in review_elements)

    # Refresh the marker once per page, outside the review loop. If it were only
    # updated while iterating reviews, a page with no matching reviews would
    # leave it unchanged and the same URL would be requested forever.
    current_page = soup.find('div', class_='pagination-link--current__09f24__vBjKh')

        
    

  if response.status_code is not 200:


# Analysing Text Data

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
# Build the frame straight from the list of review strings. Going through
# np.array would create a fixed-width unicode array padded to the longest
# review, and would yield a float64 column if no reviews were scraped.
df = pd.DataFrame({'reviews': reviews})

In [None]:
df.head()

Unnamed: 0,reviews
0,What!? I haven't left a review about this plac...
1,L'as du Falafel is located right by our apartm...
2,You will always find a long line leading up to...
3,I love this place!I never really paid attentio...
4,"Ate fallafel on a No Diet Club food tour, and ..."


In [None]:
# Number of whitespace-separated tokens in each review.
df['review_length'] = df['reviews'].map(lambda review: len(review.split()))

In [None]:
def avg_words(x):
    """Return the mean character length of the whitespace-separated words in ``x``.

    Args:
        x: Text to measure.

    Returns:
        The average word length as a float, or ``0.0`` when ``x`` contains no
        words (empty or whitespace-only), which avoids a division by zero.
    """
    words = x.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Mean word length per review; avg_words is passed directly, no lambda wrapper needed.
df['avg_word_length'] = df['reviews'].apply(avg_words)

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words('english')

In [None]:
# Count the stop words in each review (case-insensitive). Membership is tested
# against a set so each token is an O(1) lookup instead of a scan of the
# stop-word list.
stop_word_set = set(stop_words)
df['stop_word_count'] = df['reviews'].apply(
    lambda review: sum(token.lower() in stop_word_set for token in review.split())
)

In [None]:
# Share of each review's tokens that are stop words.
df['stop_word_rate'] = df['stop_word_count'].div(df['review_length'])

In [None]:
df.sort_values(by='stop_word_rate')

Unnamed: 0,reviews,review_length,avg_word_length,stop_word_count,stop_word_rate
12,"Certainement ils étaitent délicieux, mais les ...",54,5.018519,0,0.000000
58,Good break from French food incredible falafel...,13,5.846154,2,0.153846
330,Fun touristy place to stop. Big portions. Good...,24,4.916667,5,0.208333
142,Toujours imité mais jamais égalé! Easily the b...,24,4.791667,5,0.208333
215,Best falafel sandwich ever !i am totally going...,33,4.272727,8,0.242424
...,...,...,...,...,...
487,My husband and I loved the food here while in ...,29,4.241379,17,0.586207
397,The pita here is out of this world! I also enj...,46,4.043478,27,0.586957
144,At first I was skeptical of it being the o Lu ...,37,3.675676,22,0.594595
208,Growing up in Paris I can confidently say this...,20,3.800000,12,0.600000


In [None]:
df.describe()

Unnamed: 0,review_length,avg_word_length,stop_word_count,stop_word_rate
count,504.0,504.0,504.0,504.0
mean,102.916667,4.420706,48.720238,0.463312
std,78.639582,0.334692,38.431617,0.06382
min,13.0,3.644444,0.0,0.0
25%,49.0,4.197449,22.0,0.434578
50%,79.0,4.374167,38.0,0.472222
75%,139.0,4.617468,65.0,0.5
max,671.0,6.142857,313.0,0.614286


# Cleaning

In [None]:
def remove_stopwords(x):
    """Return ``x`` with every stop word dropped.

    The text is split on whitespace and each token is compared, lower-cased,
    against the module-level ``stop_words`` collection. Surviving tokens keep
    their original casing and are re-joined with single spaces.
    """
    kept = []
    for token in x.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
# Despite its name, the 'stop_words' column holds the reviews with stop words removed.
df['stop_words'] = df['reviews'].apply(remove_stopwords)

In [None]:
# Strip punctuation (anything that is neither a word character nor whitespace).
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text unchanged. The raw string keeps \w and
# \s from being read as (invalid) Python string escapes.
df['stop_words'] = df['stop_words'].str.replace(r'[^\w\s]', '', regex=True)

  df['stop_words']= df['stop_words'].str.replace('[^\w\s]', '')  #Cleaning  punctuation


In [None]:
# Review the most frequent words.
# Manually remove the most frequent words that add nothing to our analysis.

In [None]:
pd.Series(" ".join(df['stop_words']).lower().split()).value_counts()[:30]

falafel      713
line         335
place        304
paris        296
good         264
food         238
sandwich     236
sauce        229
pita         217
best         212
fallafel     201
one          189
get          189
go           181
long         146
really       145
great        144
eat          142
wait         141
got          139
inside       132
delicious    129
it           129
ive          125
order        122
falafels     120
would        118
eggplant     116
like         115
also         111
dtype: int64

In [None]:
# Frequent domain words (dish names, the venue name, filler verbs) that are
# filtered out of the reviews before further analysis.
recurring_words = [
    'falafel', 'place', 'line', 'paris', 'food', 'one', 'get', 'go',
    'sandwich', 'fallafel', 'sauce', 'pita', 'eat', 'ive', 'it', 'inside',
    'eggplant', 'las', 'falafels', 'shawarma', 'restaurant', 'also', 'would',
    'du', 'got', 'back',
]

In [None]:
# Lower-case the text so it can be compared against the lower-case recurring_words list.
df['stop_words'] = df['stop_words'].str.lower()

In [None]:
# Drop the hand-picked recurring words from each review.
df['clean_reviews'] = df['stop_words'].apply(
    lambda text: " ".join([token for token in text.split() if token not in recurring_words])
)

In [None]:
pd.Series(" ".join(df['clean_reviews']).lower().split()).value_counts()[:30]

good          264
best          212
long          146
really        145
great         144
wait          141
delicious     129
order         122
like          115
time          110
definitely    109
worth         109
try           107
spicy         107
ever          104
take          101
fresh          98
well           98
amazing        88
ordered        84
people         83
around         80
im             79
service        78
came           76
fast           76
make           73
hot            73
much           73
even           72
dtype: int64

In [None]:
df.head()

Unnamed: 0,reviews,review_length,avg_word_length,stop_word_count,stop_word_rate,stop_words,clean_reviews
0,What!? I haven't left a review about this plac...,381,4.047244,176,0.461942,what left review place mean know ive busy stil...,what left review mean know busy still somewher...
1,L'as du Falafel is located right by our apartm...,258,4.073643,136,0.527132,las du falafel located right apartment decided...,located right apartment decided give try two l...
2,You will always find a long line leading up to...,58,4.362069,29,0.5,always find long line leading familyowned midd...,always find long leading familyowned middle ea...
3,I love this place!I never really paid attentio...,82,4.512195,41,0.5,love placei never really paid attention till p...,love placei never really paid attention till v...
4,"Ate fallafel on a No Diet Club food tour, and ...",119,4.168067,57,0.478992,ate fallafel diet club food tour good went bac...,ate diet club tour good went tour maybe two th...


# Lemmatizing

In [None]:
!pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from textblob import Word

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Lemmatize every token of the cleaned review with TextBlob's Word.lemmatize().
df['lemmatized'] = df['clean_reviews'].apply(
    lambda text: " ".join([Word(token).lemmatize() for token in text.split()])
)

In [None]:
df.head()

Unnamed: 0,reviews,review_length,avg_word_length,stop_word_count,stop_word_rate,stop_words,clean_reviews,lemmatized
0,What!? I haven't left a review about this plac...,381,4.047244,176,0.461942,what left review place mean know ive busy stil...,what left review mean know busy still somewher...,what left review mean know busy still somewher...
1,L'as du Falafel is located right by our apartm...,258,4.073643,136,0.527132,las du falafel located right apartment decided...,located right apartment decided give try two l...,located right apartment decided give try two l...
2,You will always find a long line leading up to...,58,4.362069,29,0.5,always find long line leading familyowned midd...,always find long leading familyowned middle ea...,always find long leading familyowned middle ea...
3,I love this place!I never really paid attentio...,82,4.512195,41,0.5,love placei never really paid attention till p...,love placei never really paid attention till v...,love placei never really paid attention till v...
4,"Ate fallafel on a No Diet Club food tour, and ...",119,4.168067,57,0.478992,ate fallafel diet club food tour good went bac...,ate diet club tour good went tour maybe two th...,ate diet club tour good went tour maybe two th...


In [None]:
# Number of tokens left in each review after cleaning.
df['clean_review_word_count'] = df['clean_reviews'].map(lambda text: len(text.split()))

In [None]:
# Fraction of the original review's tokens that survived cleaning.
df['clean_review_word_rate'] = df['clean_review_word_count'].div(df['review_length'])

In [None]:
df.head()

Unnamed: 0,reviews,review_length,avg_word_length,stop_word_count,stop_word_rate,stop_words,clean_reviews,lemmatized,clean_review_word_count,clean_review_word_rate
0,What!? I haven't left a review about this plac...,381,4.047244,176,0.461942,what left review place mean know ive busy stil...,what left review mean know busy still somewher...,what left review mean know busy still somewher...,167,0.43832
1,L'as du Falafel is located right by our apartm...,258,4.073643,136,0.527132,las du falafel located right apartment decided...,located right apartment decided give try two l...,located right apartment decided give try two l...,90,0.348837
2,You will always find a long line leading up to...,58,4.362069,29,0.5,always find long line leading familyowned midd...,always find long leading familyowned middle ea...,always find long leading familyowned middle ea...,24,0.413793
3,I love this place!I never really paid attentio...,82,4.512195,41,0.5,love placei never really paid attention till p...,love placei never really paid attention till v...,love placei never really paid attention till v...,34,0.414634
4,"Ate fallafel on a No Diet Club food tour, and ...",119,4.168067,57,0.478992,ate fallafel diet club food tour good went bac...,ate diet club tour good went tour maybe two th...,ate diet club tour good went tour maybe two th...,48,0.403361


# Sentiment Analysis

In [None]:
from textblob import TextBlob

#### We get a polarity metric and a subjectivity metric. The polarity metric tells us how positive or negative a sentence is (ranging from -1 to +1); the subjectivity metric ranges from 0 (objective) to 1 (subjective).

In [None]:
# TextBlob's sentiment is a (polarity, subjectivity) named tuple; read the field by name.
df['polarity'] = df['lemmatized'].apply(lambda text: TextBlob(text).sentiment.polarity)

In [None]:
# Second field of TextBlob's sentiment named tuple, read by name.
df['subjectivity'] = df['lemmatized'].apply(lambda text: TextBlob(text).sentiment.subjectivity)

In [None]:
df.describe()

Unnamed: 0,review_length,avg_word_length,stop_word_count,stop_word_rate,clean_review_word_count,clean_review_word_rate,polarity,subjectivity
count,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0
mean,102.916667,4.420706,48.720238,0.463312,44.039683,0.433282,0.303083,0.552315
std,78.639582,0.334692,38.431617,0.06382,34.249964,0.06508,0.153308,0.118315
min,13.0,3.644444,0.0,0.0,6.0,0.257143,-0.288889,0.21875
25%,49.0,4.197449,22.0,0.434578,21.0,0.393892,0.19665,0.472917
50%,79.0,4.374167,38.0,0.472222,34.0,0.428026,0.293498,0.557407
75%,139.0,4.617468,65.0,0.5,59.25,0.470757,0.394792,0.631239
max,671.0,6.142857,313.0,0.614286,311.0,0.907407,0.95,1.0


##### We can see that the average polarity is 0.303, which is positive, so on the whole this business is receiving positive reviews. The average subjectivity is also fairly high (0.552), which means each review is quite subjective to the person writing it, as expected when we are dealing with reviews.

In [None]:
# Keep the raw review, the numeric features and the sentiment scores; drop the
# intermediate text columns.
data_sentiment_analysis = df.drop(columns=['stop_words', 'clean_reviews', 'lemmatized'])

In [None]:
data_sentiment_analysis.head()

Unnamed: 0,reviews,review_length,avg_word_length,stop_word_count,stop_word_rate,clean_review_word_count,clean_review_word_rate,polarity,subjectivity
0,What!? I haven't left a review about this plac...,381,4.047244,176,0.461942,167,0.43832,0.29083,0.571298
1,L'as du Falafel is located right by our apartm...,258,4.073643,136,0.527132,90,0.348837,0.212434,0.515212
2,You will always find a long line leading up to...,58,4.362069,29,0.5,24,0.413793,0.125,0.348148
3,I love this place!I never really paid attentio...,82,4.512195,41,0.5,34,0.414634,0.421104,0.461039
4,"Ate fallafel on a No Diet Club food tour, and ...",119,4.168067,57,0.478992,48,0.403361,0.196429,0.477041


In [None]:
# Reviews whose polarity is strictly below zero.
negative_reviews = data_sentiment_analysis.loc[data_sentiment_analysis['polarity'].lt(0)]

In [None]:
# Reviews whose polarity is strictly above zero (exactly-neutral reviews fall in neither split).
positive_reviews = data_sentiment_analysis.loc[data_sentiment_analysis['polarity'].gt(0)]