In [1]:
import pandas as pd
import nltk
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/savidu.dias/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/savidu.dias/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def wup(S1, S2):
    return S1.wup_similarity(S2)

In [3]:
def preProcess(sentence):
    Stopwords = list(set(nltk.corpus.stopwords.words('english')))
    stemmer = SnowballStemmer("english")
    
    words = word_tokenize(sentence)
    words = [word.lower() for word in words if word.isalpha() and word not in Stopwords]
 
    return words

In [4]:
def word_similarity(w1,w2):
    wn.synsets(w1)
    S1 = wn.synsets(w1)
    S2 = wn.synsets(w2)
    if len(S1) == 0 or len(S2) == 0:
        return 0

    S1 = S1[0]
    S2 = S2[0]
    if S1 and S2:
       similarity = wup(S1, S2)
       if similarity:
          return round(similarity,2)
    return 0

In [5]:
def Similarity(review, keywords):
    words1 = preProcess(review)
    words2 = preProcess(keywords)

    tf = TfidfVectorizer(use_idf=True, token_pattern='(?u)\\b\\w+\\b')
    tf.fit_transform([' '.join(words1), ' '.join(words2)])
    Idf = dict(zip(tf.get_feature_names(), tf.idf_))

    Sim_score1 = 0
    Sim_score2 = 0

    for w1 in words1:
        Max = 0
        for w2 in words2:
            score = word_similarity(w1, w2)
            if Max < score:
                Max = score
        Sim_score1 += Max * Idf[w1]
    Sim_score1 /= sum([Idf[w1] for w1 in words1])

    for w2 in words2:
        Max = 0
        for w1 in words1:
            score = word_similarity(w1, w2)
            if Max < score:
                Max = score
        Sim_score2 += Max * Idf[w2]

    Sim_score2 /= sum([Idf[w2] for w2 in words2])

    Sim = (Sim_score1 + Sim_score2) / 2

    return round(Sim, 2)

In [6]:
category_keywords = {
    'Price': 'price',
    'Quality': 'quality taste',
    'Quantity': 'quantity',
    'Location': 'location'
}

In [7]:
price_vector = []
quality_vector = []
quantity_vector = []
location_vector = []

In [8]:
reviews = pd.read_csv("data/Restaurant_Reviews.tsv", sep="\t")
for review in reviews['Review']:
    price_vector.append(Similarity(review, category_keywords['Price']))
    quality_vector.append(Similarity(review, category_keywords['Quality']))
    quantity_vector.append(Similarity(review, category_keywords['Quantity']))
    location_vector.append(Similarity(review, category_keywords['Location']))
    
    
reviews['Price'] = price_vector
reviews['Quality'] = quality_vector
reviews['Quantity'] = quantity_vector
reviews['Location'] = location_vector

reviews    



Unnamed: 0,Review,Liked,Price,Quality,Quantity,Location
0,Wow... Loved this place.,1,0.25,0.29,0.36,0.61
1,Crust is not good.,0,0.46,0.47,0.35,0.59
2,Not tasty and the texture was just nasty.,0,0.36,0.40,0.40,0.28
3,Stopped by during the late May bank holiday of...,1,0.25,0.34,0.44,0.42
4,The selection on the menu was great and so wer...,1,0.67,0.44,0.34,0.35
...,...,...,...,...,...,...
995,I think food should have flavor and texture an...,0,0.38,0.50,0.44,0.36
996,Appetite instantly gone.,0,0.33,0.36,0.35,0.27
997,Overall I was not impressed and would not go b...,0,0.25,0.30,0.39,0.37
998,"The whole experience was underwhelming, and I ...",0,0.26,0.39,0.39,0.33
