In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer

from scipy.stats import pearsonr
from numpy.linalg import norm
import numpy as np

stemmer = WordNetLemmatizer()

In [2]:
def stem_list(words):
    stemmed_words = []
    for word in words:
        word = word.lower()
        stemmed_words.append(stemmer.lemmatize(word))
    return stemmed_words

In [3]:
def get_stemmed_document(document):
    tokens = document.split()
    stemmed_words = stem_list(tokens)
    return stemmed_words

In [4]:
def calculate_pearson_correlation(annotations, category_vector):    
    corr, _ = pearsonr(annotations, category_vector)
    return corr

In [5]:
def calculate_cosine_similarity(annotations, category_vector):
    cosine = np.dot(annotations, category_vector) / (norm(annotations) * norm(category_vector))
    return cosine

In [6]:
price_keywords = ['price', 'cost', 'money', 'overpriced', 'cheap', 'cash', 'inexpensive', 'deal', 'value']
quality_keywords = ['taste', 'fresh', 'yum', 'amazing', 'beautiful', 'good', 'terrible', 'bad', 'disgust', 'generic', 
                    'tasty', 'refreshing', 'delicious', 'meh', 'appalling', 'bland', 'subpar', 'quality']
quantity_keywords = ['combo', 'portion', 'big', 'small', 'quantity', 'amount', 'number', 'total', 'size']
location_keywords = ['place', 'city' 'visit', 'view', 'town', 'vegas', 'location', 'spot', 'area', 'neighborhood']

In [7]:
price_keywords = stem_list(price_keywords)
quality_keywords = stem_list(quality_keywords)
quantity_keywords = stem_list(quantity_keywords)
location_keywords = stem_list(location_keywords)

In [8]:
def generate_category_vector(reviews):
    price_vector = []
    quality_vector = []
    quantity_vector = []
    location_vector = []
    
    for review in reviews['Review']:
        stemmed_review = get_stemmed_document(review)

        if any(word in price_keywords for word in stemmed_review):
            price_vector.append(1)
        else:
            price_vector.append(0)

        if any(word in quality_keywords for word in stemmed_review):
            quality_vector.append(1)
        else:
            quality_vector.append(0)

        if any(word in quantity_keywords for word in stemmed_review):
            quantity_vector.append(1)
        else:
            quantity_vector.append(0)

        if any(word in location_keywords for word in stemmed_review):
            location_vector.append(1)
        else:
            location_vector.append(0)
            
    reviews['Price'] = price_vector
    reviews['Quality'] = quality_vector
    reviews['Quantity'] = quantity_vector
    reviews['Location'] = location_vector
    return reviews

In [9]:
def print_similarity_values(target_vector, annotations, reviews):
    print("Pearson correlation between " + target_vector +" and price: ", 
          calculate_pearson_correlation(annotations, reviews['Price']))
    print("Pearson correlation between " + target_vector +" and quality: ", 
          calculate_pearson_correlation(annotations, reviews['Quality']))
    print("Pearson correlation between " + target_vector +" and quantity: ", 
          calculate_pearson_correlation(annotations, reviews['Quantity']))
    print("Pearson correlation between " + target_vector +" and location: ", 
          calculate_pearson_correlation(annotations, reviews['Location']))
    print()
    
    print("Cosine similarity between " + target_vector + " and price: ", 
          calculate_cosine_similarity(annotations, reviews['Price']))
    print("Cosine similarity between " + target_vector + " and quality: ", 
          calculate_cosine_similarity(annotations, reviews['Quality']))
    print("Cosine similarity between " + target_vector + " and quantity: ", 
          calculate_cosine_similarity(annotations, reviews['Quantity']))
    print("Cosine similarity between " + target_vector + " and location: ", 
          calculate_cosine_similarity(annotations, reviews['Location']))

Obtaining similarity values between the labelled sentiment and the category vectors

In [10]:
reviews = pd.read_csv("data/Restaurant_Reviews.tsv", sep="\t")
reviews_category_vectors = generate_category_vector(reviews)

In [11]:
print_similarity_values('labelled sentiment', reviews_category_vectors['Liked'], reviews_category_vectors)

Pearson correlation between labelled sentiment and price:  -0.050271730430030276
Pearson correlation between labelled sentiment and quality:  0.08920515501750784
Pearson correlation between labelled sentiment and quantity:  8.61940727125976e-18
Pearson correlation between labelled sentiment and location:  0.023437864934919773

Cosine similarity between labelled sentiment and price:  0.07893522173763264
Cosine similarity between labelled sentiment and quality:  0.31378581622109447
Cosine similarity between labelled sentiment and quantity:  0.10954451150103323
Cosine similarity between labelled sentiment and location:  0.23821728473701217


Obtaining similarity values between the Textblob classification and the category vectors

In [12]:
reviews = pd.read_csv("data/derived/restaurant_reviews_textblob.csv", sep="\t")
reviews_category_vectors = generate_category_vector(reviews)
print_similarity_values('textblob classification', reviews_category_vectors['Category'], reviews_category_vectors)

Pearson correlation between textblob classification and price:  -0.05443291387738019
Pearson correlation between textblob classification and quality:  0.02460358379651839
Pearson correlation between textblob classification and quantity:  -0.012834877271702712
Pearson correlation between textblob classification and location:  0.0022315562124795114

Cosine similarity between textblob classification and price:  0.09428672450603165
Cosine similarity between textblob classification and quality:  0.29867799427598285
Cosine similarity between textblob classification and quantity:  0.11449285668638783
Cosine similarity between textblob classification and location:  0.24964913686647244


Obtaining similarity values between the Sentistrength classification and the category vectors

In [13]:
reviews = pd.read_csv("data/derived/restaurant_reviews_sentistrength.csv", sep="\t")
reviews_category_vectors = generate_category_vector(reviews)
print_similarity_values('sentistrength classification', reviews_category_vectors['Category'], reviews_category_vectors)

Pearson correlation between sentistrength classification and price:  0.003543813582588844
Pearson correlation between sentistrength classification and quality:  0.023224245660805283
Pearson correlation between sentistrength classification and quantity:  0.013434616188100964
Pearson correlation between sentistrength classification and location:  0.05764142831457746

Cosine similarity between sentistrength classification and price:  0.14569989241669287
Cosine similarity between sentistrength classification and quality:  0.3320004802106672
Cosine similarity between sentistrength classification and quantity:  0.1444278629379538
Cosine similarity between sentistrength classification and location:  0.30577886499345286
