## Yelp Tips Extraction and Sentiment Analysis


## Motivation
- Restaurant goers often face the problem of picking a restaurant. They often go through multiple reviews to come to a decision. 
- The restaurant owners too must go through each and every long review to understand what the customers are thinking about their restaurants.
- Our model extracts relevant tips from a large set of restaurant reviews and performs sentiment analysis on those tips.

In [None]:
'''
This script parses the locally saved Yelp review pages (HTML files) for the chosen cuisine and writes one text file of
reviews per restaurant. It was used to extract close to 10K reviews from yelp.com
'''
import os
import time
from bs4 import BeautifulSoup



cuisine = input("Enter the cuisine: ")
path = "C:/Users/prath/Desktop/"+cuisine+"/"

# Every file in the cuisine folder is parsed as a saved review page. The output
# file is named after the part of the file name that precedes "_review".
for filename in os.listdir(path):
    restaurant_name = filename.split("_review",1)[0]
    print(restaurant_name)

    soup = None
    for i in range(5):  # try 5 times
        try:
            # Context manager so the page's file handle is released even when
            # parsing fails (the original left it open).
            with open(path+filename, encoding="utf-8") as page:
                soup = BeautifulSoup(page,"html.parser")
            break  # we got the file, break the loop
        except Exception as e:  # reading or parsing the saved page failed
            print('failed attempt', i, e)  # show the cause instead of hiding it
            time.sleep(2)  # wait 2 secs

    if soup is None: continue  # couldnt get the page, ignore

    # Collect the English review paragraphs first so the output file is opened
    # once per restaurant instead of once per review.
    review_lines = []
    for review in soup.findAll('div', {'class': 'review-content'}):  # all the review divs
        text = review.find('p',{'lang': 'en'})
        if text:
            review_lines.append(text.text + '\n')

    # Only touch the output file when there is something to append, matching
    # the original behaviour of never creating empty review files.
    # NOTE(review): the output uses the platform default encoding while the
    # input is read as UTF-8 -- confirm this is intended before changing it,
    # since later steps read these files with the default encoding too.
    if review_lines:
        with open(restaurant_name+".txt",'a+') as f:
            f.writelines(review_lines)

In [None]:
'''
This is a sample code for generating a list of Mexican Restaurants in New York (Contains URL identifiers)
'''

import unicodedata
from lxml import etree
import requests
from lxml import html
import numpy as np
import io
import numpy as np

# Builds an empty <html><head/><body/></html> tree and prints it. Nothing below
# uses `root`; it is kept only so the cell's printed output stays the same.
root = etree.Element('html')
etree.SubElement(root,'head')
etree.SubElement(root,'body')
print(etree.tostring(root))

# Values passed as the "start" query parameter: 0, 10, ..., 990.
s=np.arange(0,1000,10)

# `with` guarantees List_NY.txt is flushed and closed even if a request fails
# part-way through the crawl.
with open("List_NY.txt","w") as f:
    for j in s:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        page=requests.get('https://www.yelp.com/search?find_desc=Mexican+Food&find_loc=New+York,+NY&start='+str(j), timeout=30)
        html_content = html.fromstring(page.content)
        for i in range(3,11):  # list items 3..10 of the results list
            x=html_content.xpath('//*[@id="super-container"]/div/div[2]/div[1]/div/div[4]/ul[2]/li['+str(i)+']/div/div[1]/div[1]/div/div[2]/h3/span/a/@href')
            if not x:
                # No link at this position. The original wrote a blank line
                # here, which the highlights scraper then turned into a
                # request for the bare site root.
                continue
            # Flatten the list repr into plain text.
            # NOTE(review): replacing ',' with 'Mexican' looks unintentional
            # (it only matters when several hrefs match); kept as-is.
            y=str(x).replace(',','Mexican').replace("'","").replace('[','').replace(']','')
            f.write("%s\n"%y)

In [None]:
'''
This scraper takes the list of restaurants as input and scrapes 'review highlights' of all the restaurants
'''

from lxml import etree
import requests
from lxml import html
import string
from nltk.tokenize.moses import MosesDetokenizer
import nltk


# Builds an empty <html><head/><body/></html> tree and prints it. Nothing below
# uses `root`; it is kept only so the cell's printed output stays the same.
root = etree.Element('html')
etree.SubElement(root,'head')
etree.SubElement(root,'body')
print(etree.tostring(root))

path='List_NY.txt'    #Reading the file from which the restaurants names are taken to search in Yelp and Scrape the reviews

# One detokenizer is enough; the original rebuilt it for every highlight.
detokenizer = MosesDetokenizer()

# Both files are managed by `with` so they are closed even if a request fails
# (the original never closed the input file at all).
with open(path) as fin, open("List_NY_highlights.txt","w") as f:
    for line in fin:
        restraunt_name = line.lower().strip()
        if not restraunt_name:
            # A blank line would otherwise fetch the bare site root.
            continue
        url='https://www.yelp.com'+restraunt_name
        # A timeout keeps one stalled connection from hanging the whole crawl.
        page= requests.get(url, timeout=30)
        html_content = html.fromstring(page.content)
        for i in range(1,4):  # the first three highlight list items
            z=html_content.xpath('//*[@id="super-container"]/div/div/div[1]/div[1]/div[1]/ul/li['+str(i)+']/div[2]/p/text()')

            # Flatten the list repr into plain text.
            z=str(z).replace('[','').replace(']','')
            # NOTE(review): the pattern below contains a real newline, whereas
            # a list repr spells newlines as backslash + 'n', so this replace
            # presumably never matches; kept to avoid changing the output.
            z=str(z).replace("', '\n    '","")
            z=z.replace("'", "")

            # Tokenize, strip punctuation characters, drop emptied tokens.
            x = nltk.word_tokenize(z)
            x = [''.join(c for c in s if c not in string.punctuation) for s in x]
            x = [s for s in x if s]

            # Drop bare 'n' tokens (presumably what is left of the escaped
            # newlines once the backslash is removed as punctuation).
            x1= list(filter(('n').__ne__, x))
            size = len(x1)
            x1 = x1[1:size-2]  # trim the first token and the last two
            list1=detokenizer.detokenize(x1, return_str=True)
            # The original also called list1.strip() here and discarded the
            # result; that no-op is dropped, so the output is unchanged.
            list1=list1[1:-1]  # trim the first and last character
            f.write("%s\n"%list1)


In [None]:
'''
The script includes the following pre-processing steps for text:
- Sentence Splitting
- Term Tokenization
- Ngrams
- POS tagging

The run function includes all bigrams of the form: <ADVERB> <ADJECTIVE>
'''
import nltk
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize
from nltk import load

def getAdvAdjTwograms(terms,adj,adv):
    """Return every adjacent pair in ``terms`` that is an adverb followed by an adjective.

    Args:
        terms: Tokens of one sentence, in order.
        adj: Collection of tokens considered adjectives.
        adv: Collection of tokens considered adverbs.

    Returns:
        A list of 2-tuples ``(adverb, adjective)`` in order of appearance.
    """
    return [pair for pair in ngrams(terms, 2) if pair[0] in adv and pair[1] in adj]

def getAdjOnegrams(terms,adj):
    """Return the tokens of ``terms`` that are adjectives, in order of appearance.

    Args:
        terms: Tokens of one sentence, in order.
        adj: Collection of tokens considered adjectives.

    Returns:
        A list of the matching tokens (duplicates preserved).
    """
    # ngrams(..., 1) yields 1-tuples, so the token itself is element 0.
    return [gram[0] for gram in ngrams(terms, 1) if gram[0] in adj]

def getPOSterms(terms,POStags,tagger):
    """Group the tokens of a sentence by the POS tag prefixes of interest.

    Args:
        terms: Tokens of one sentence.
        POStags: Tag prefixes to collect (a token tagged 'JJR' is filed under 'JJ').
        tagger: Object whose ``tag(terms)`` returns (token, tag) pairs.

    Returns:
        A dict mapping each entry of ``POStags`` to the set of tokens whose
        tag starts with that prefix.
    """
    tagged = list(tagger.tag(terms))  # POS-tag the tokenized sentence once
    return {
        prefix: {pair[0] for pair in tagged if pair[1].startswith(prefix)}
        for prefix in POStags
    }

def _collectFromSentences(fpath, extract):
    """Run ``extract`` over every sentence of a text file and pool the results.

    Args:
        fpath: Path of the text file to read.
        extract: Callable ``extract(terms, POSterms)`` returning an iterable of
            hashable items for one sentence, where ``terms`` are the sentence
            tokens and ``POSterms`` maps 'JJ' / 'RB' to the sets of tokens
            tagged as such.

    Returns:
        The set of all items produced for all sentences. An empty set when the
        file has no sentences (the original code raised UnboundLocalError
        there, because its result was only assigned inside the loop).
    """
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)
    with open(fpath) as f:
        text=f.read().strip()
    sentences=sent_tokenize(text)
    print ('NUMBER OF SENTENCES: ',len(sentences))
    POStags=['JJ','RB'] # POS tags of interest
    found=set()
    for sentence in sentences:
        terms = nltk.word_tokenize(sentence)
        POSterms=getPOSterms(terms,POStags,tagger)
        # Accumulate directly into the set instead of rebuilding it from a
        # growing list on every sentence.
        found.update(extract(terms, POSterms))
    return found

def run1(fpath):
    """Return the set of all <ADVERB> <ADJECTIVE> bigrams found in the file.

    Args:
        fpath: Path of the text file to analyse.

    Returns:
        A set of ``(adverb, adjective)`` tuples.
    """
    return _collectFromSentences(
        fpath,
        lambda terms, POSterms: getAdvAdjTwograms(terms, POSterms['JJ'], POSterms['RB']),
    )

def run2(fpath):
    """Return the set of all adjectives found in the file.

    Args:
        fpath: Path of the text file to analyse.

    Returns:
        A set of adjective tokens.
    """
    return _collectFromSentences(
        fpath,
        lambda terms, POSterms: getAdjOnegrams(terms, POSterms['JJ']),
    )

if __name__=='__main__':
    # Store the results under their own names. The original rebound `run1` and
    # `run2` to the returned sets, so running this cell a second time failed
    # with "'set' object is not callable".
    adv_adj_pairs=run1('List_NY_highlights.txt')
    adjectives=run2('List_NY_highlights.txt')
    # `with` closes the lexicon file even if a write fails.
    with open("List_lexicon_of_expressions.txt","w") as f:
        # One bigram per line: each term followed by a space.
        for pair in adv_adj_pairs:
            for term in pair:
                f.write(str(term)+" ")
            f.write("\n")
        # One adjective per line, each preceded by the line break.
        for adjective in adjectives:
            f.write("\n"+str(adjective))

In [None]:
'''
This step is used to extract tips from the main review file using the lexicon prepared in the above step
'''
import re


# Output file for the extracted tips: the sentence loop further down writes the
# matching sentences to it and closes it once all sentences are processed.
f=open("tips.txt",'w')
def loadLexicon(fname):
    """Load a lexicon file into a set, one entry per non-blank line.

    Args:
        fname: Path of the lexicon file.

    Returns:
        A set with every line of the file, stripped of surrounding whitespace.
        Blank lines are skipped: an empty-string entry would match the empty
        tokens produced when a sentence is split on single spaces, flagging
        sentences that contain no lexicon word at all.
    """
    with open(fname) as lex_conn:
        stripped = (line.strip() for line in lex_conn)  # also drops the line-change character
        return {entry for entry in stripped if entry}

# Writes to the module-level `f` (tips.txt, opened for writing before
# loadLexicon is defined). try/finally makes sure it is closed even if reading
# the inputs or processing a sentence fails.
try:
    with open("Italian/rosies-new-york.txt") as searchfile:            #Restaurant review file
        text=searchfile.read().lower()
    file_lex=loadLexicon('List_lexicon_of_expressions.txt')
    sentences = text.split('.')

    for sent in sentences:
        sent=sent.strip()  # text is already lower-cased above
        # Replace every character outside the allowed set with a space.
        # NOTE(review): inside a character class '|' is a literal and "|-|" is
        # a range, so '|' is kept and '-' is replaced -- confirm whether
        # hyphens were meant to be kept; the pattern is left unchanged.
        sent=re.sub("[^a-zA-Z0-9|?|.|,|!|-|:|;|&|@|_|/|>|<|#|$|']",' ',sent)
        # split() without arguments drops the empty tokens that split(' ')
        # produced for consecutive spaces; those matched a blank lexicon entry
        # and flagged sentences containing no lexicon word.
        if any(word in file_lex for word in sent.split()):
            f.write(str(sent)+" "+"\n")
finally:
    f.close()

In [None]:
'''
To further filter the reviews for users, we use this code to create a list of relevant nouns. This step will allow
us to keep only those tips that are relevant during the next step
'''

import nltk
from nltk.util import ngrams 
from nltk.tokenize import sent_tokenize
from nltk import load
from textblob import TextBlob, Word
from collections import Counter
import csv


# Read all extracted tips as one text.
with open('tips.txt') as f:
    text=f.read().strip()
blob = TextBlob(text)

# Keep the lemma of every token tagged 'NN', then drop the pronoun 'i' (the
# text was lower-cased upstream).
nouns = [word.lemmatize() for word, tag in blob.tags if tag == 'NN']
nouns=[x for x in nouns if x != 'i']

# The original never closed nouns.txt, so the write could stay buffered; `with`
# flushes and closes it.
with open('nouns.txt','w') as fn:
    fn.write(str(nouns))

# Frequency table of the nouns; not referenced again in this cell.
freqNouns = Counter(nouns)

# newline='' is what the csv module asks for; without it an extra carriage
# return is written per row on Windows, which readers see as blank rows.
with open('nouns.csv','w', newline='') as csvfile:
    writer=csv.writer(csvfile,dialect='excel')
    # One noun per row, no header row.
    writer.writerows([noun] for noun in nouns if noun)

In [None]:
'''
This step uses word2vec model to find similar or relevant words to our word of interest. This sample code gives out a list 
of words relevant to the word 'food'
'''

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging
from sklearn.preprocessing import StandardScaler
from pylab import rcParams
import csv


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# The original opened relevant.txt for writing and then never used or closed
# the handle. Keep the visible effect (an empty relevant.txt is created or
# truncated) but release the handle immediately.
open("relevant.txt",'w').close()

# train word2vec model on the list
# NOTE(review): `size=` is the gensim < 4.0 spelling (renamed `vector_size` in
# 4.0); left unchanged because the installed gensim version is not visible here.
model0 = Word2Vec(LineSentence("nouns.csv"), size=10, window=2, min_count=1, workers=4)

# Query through the model's KeyedVectors (`wv`); calling most_similar directly
# on the model is deprecated and was removed in gensim 4.0.
list1=model0.wv.most_similar('food', topn=10)
print((list1[0]))

# newline='' is what the csv module asks for; without it an extra carriage
# return is written per row on Windows, which readers see as blank rows.
with open("relevant.csv","w", newline='') as result:
    wr = csv.writer(result,dialect="excel")
    wr.writerows(list1)  # one (word, similarity) row per neighbour

In [None]:
'''
This step generates a list of relevant tips based on the list of relevant words which will be used for recommendation 
for the customers and restaurant owner 
'''

import unicodedata
from lxml import etree
import requests
from lxml import html
import numpy as np
import io
import pandas as pd
import re
import nltk


# nouns.csv is written upstream as one noun per row with no header row, so it
# is read with header=None. The original used header=0 plus a 'words' column
# lookup, which consumed the first noun as the header and raised KeyError.
# dtype=str and keep_default_na=False stop pandas from turning entries such as
# "null" or "nan" into NaN, which would break the substring test below.
noun_frame = pd.read_csv('nouns.csv', sep=',', header=None, usecols=[0],
                         dtype=str, keep_default_na=False)
listw = noun_frame[0].tolist()
if listw and listw[0] == 'words':
    # Tolerate a hand-added 'words' header row, as the original code expected.
    listw = listw[1:]
# Drop empty entries (an empty string is a substring of every sentence) and
# repeated nouns, keeping first-seen order so the output order is unchanged.
listw = [w for w in dict.fromkeys(listw) if w]

with open("tips.txt") as f:
    # strip() removes the trailing line break and surrounding whitespace
    content = [x.strip() for x in f]

# For every noun keep the last tip that mentions it.
# NOTE(review): `word in sent` is a substring test, so a short noun also
# matches inside longer words -- confirm this is intended.
unique={}
for sent in content:
    for word in listw:
        if word in sent:
            unique[word] = sent

# Opened only once the results exist, and closed by `with` even on error.
with open("final_tips.txt",'w') as g:
    for word, sent in unique.items():
        g.write(str(word)+": "+sent+"\n")

In [None]:
'''
This step performs sentiment analysis on the final tips
'''
import nltk
from nltk.corpus import stopwords

def tokenize(str):
    """Lower-case the text and split it into word tokens.

    A token starts with a letter and may continue with letters, hyphens and
    periods.

    Args:
        str: The text to tokenize.

    Returns:
        The list of tokens, in order of appearance.
    """
    word_pattern = r'[a-z]+[a-z\-\.]*'
    return nltk.regexp_tokenize(str.lower(), word_pattern)
    
class Text_Analyzer(object):
    """Lexicon-based sentiment labeller for a file with one tip per line.

    Each tip is tokenized, stop words are removed, and the remaining tokens
    are counted against ``positive-words.txt`` and ``negative-words.txt``.
    """

    def __init__(self, input_file):
        """Remember the path of the tips file to analyse.

        Args:
            input_file: Path of a text file with one tip per line.
        """
        self.input = input_file

    # The constructor was originally misspelled `_init_`, so Python never
    # called it and `Text_Analyzer(path)` raised TypeError. The old name is
    # kept as an alias for any code that called it explicitly.
    _init_ = __init__

    @staticmethod
    def _load_word_set(path):
        """Return the stripped lines of a word-list file as a set."""
        with open(path, 'r') as value:
            return {entry.strip() for entry in value}

    @staticmethod
    def _classify(positive_count, negative_count):
        """Map positive/negative word counts to 'positive', 'negative' or 'neutral'."""
        if positive_count > negative_count:
            return "positive"
        if positive_count < negative_count:
            return "negative"
        return "neutral"

    def sentiment_analysis(self):
        """Print every tip with its sentiment label.

        Returns:
            The label of the last tip ('positive', 'negative' or 'neutral'),
            or None when the input file has no lines (the original raised
            UnboundLocalError in that case).
        """
        with open(self.input) as f:
            tips = [row.rstrip('\n') for row in f]
        if not tips:
            return None

        # Load the word lists once, as sets, instead of re-reading both files
        # and scanning lists for every tip.
        stop_words = set(stopwords.words('english'))
        positive_word = self._load_word_set("positive-words.txt")
        negative_word = self._load_word_set("negative-words.txt")

        response = None
        for tip in tips:
            print(tip)
            tokens = tokenize(str(tip))
            filtered_tokens = [token for token in tokens if token not in stop_words]
            positive_count = sum(1 for token in filtered_tokens if token in positive_word)
            negative_count = sum(1 for token in filtered_tokens if token in negative_word)
            response = self._classify(positive_count, negative_count)
            print("The tip is:",str(response))
        return response

if __name__ == "__main__":  
    # Analyse the tips in final_tips.txt: `response` holds the analyzer object
    # and `sentiment` receives whatever sentiment_analysis() returns.
    response = Text_Analyzer("final_tips.txt")
    sentiment= response.sentiment_analysis()

## Final Thoughts
- This model would efficiently assist restaurants in keeping up the good work they are doing and in improving upon the negatives. 
- The restaurant owners would specifically know what to improve upon rather than generic positive/negative reviews.
- Customers would also benefit greatly from the positive tips fed to the website, saving the time otherwise spent going through entire reviews to find out what is good or bad about the restaurant. 