# Restaurant Reviews - Sentiment Analysis and Machine Learning

*Ville Kylmämaa, Joona Holappa, Miiro Kuosmanen, Anssi Valjakka*

In [None]:
# Python built-in modules
import sys
import os.path
import json
import subprocess
import shlex
from collections import Counter

# %pip install pandas
import pandas as pd

# %pip install numpy
import numpy as np

# %pip install nltk
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# %pip install scipy
from scipy.spatial.distance import cosine

# %pip install matplotlib
import matplotlib.pyplot as plt

# %pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# %pip install empath
from empath import Empath

# %pip install wordcloud
from wordcloud import WordCloud


In [None]:

# Load the tab-separated restaurant reviews into a pandas DataFrame and show it
reviews_df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
print(reviews_df)

# Keep the review texts and the "Liked" annotations as two separate Series
review_column, liked_column = reviews_df["Review"], reviews_df["Liked"]


In [None]:

# Class balance: count how often each annotation value occurs
liked_values = liked_column.tolist()
dislike_total = liked_values.count(0)
like_total = liked_values.count(1)
print(f"\nThe number of dislike (0) reviews in the data:\n{dislike_total}")
print(f"\nThe number of like (1) reviews in the data:\n{like_total}")

----

# Task 1

Initially, use the SentiStrength implementation of sentiment analysis (SentiStrength - sentiment strength detection
in short texts, sentiment analysis, opinion mining: http://sentistrength.wlv.ac.uk/), which provides a negative and a
positive sentiment score. Then compute the Pearson correlation between this constructed sentiment polarity and the annotation.

In [None]:

# SentiStrength setup (http://sentistrength.wlv.ac.uk/):
# locations of the jar file and of its language data folder
senti_strength_path = "./sentistrength/SentiStrength.jar"
language_folder_path = "./sentistrength/SentiStrength_Data/"

# Report a missing jar or data folder up front
for path_exists, not_found_message, checked_path in (
    (os.path.isfile, "SentiStrength not found at: ", senti_strength_path),
    (os.path.isdir, "SentiStrength data folder not found at: ", language_folder_path),
):
    if not path_exists(checked_path):
        print(not_found_message, checked_path)

# Returns SentiStrength sentiment score for the given string
def rate_sentiment(sentiString):
    """Rate one text with SentiStrength and return its raw score string.

    Args:
        sentiString: The text to rate.

    Returns:
        SentiStrength's stdout with trailing whitespace stripped and tabs replaced
        by single spaces, e.g. "1 -5" (positive rating, then negative rating).

    Raises:
        RuntimeError: If SentiStrength produced no output (for example because the
            jar or the data folder is missing); the message carries the exit code
            and whatever Java wrote to stderr.
    """
    # Pass the arguments as a list so that paths containing spaces or quotes reach
    # Java intact (building one string and shlex-splitting it breaks on such paths).
    command = ["java", "-jar", senti_strength_path, "stdin", "sentidata", language_folder_path]
    p = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Communicate via stdin the string to be rated. All spaces are replaced with +,
    # and the text is sent as bytes because pipes do not accept str in Python 3.
    payload = sentiString.replace(" ", "+").encode("utf-8")
    stdout_bytes, stderr_bytes = p.communicate(payload)

    # Remove the tab spacing between the positive and negative ratings. e.g. "1    -5" -> "1 -5"
    stdout_text = stdout_bytes.decode("utf-8").rstrip().replace("\t", " ")

    # An empty result cannot be converted to a score later on, so fail here with the
    # diagnostics instead of letting the caller hit an obscure int('') error.
    if not stdout_text:
        stderr_text = stderr_bytes.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"SentiStrength returned no output (exit code {p.returncode}): {stderr_text}"
        )
    return stdout_text

# Convert the score from 1 -5 to binary: 0 for dislike, 1 for like
# "Neutral" score, for example -2 2, is cast as 0
def score_to_binary(score_original):
    """Convert a SentiStrength "<positive> <negative>" score string to a binary label.

    Args:
        score_original: Score string such as "1 -5"; the two leading
            whitespace-separated tokens are read as the positive and the negative
            rating, and any further tokens are ignored.

    Returns:
        1 (like) if the positive rating is strictly greater than the magnitude of
        the negative rating, otherwise 0 (dislike). Ties ("neutral") give 0.

    Raises:
        ValueError: If fewer than two tokens are present or a token is not an integer.
    """
    # split() without an argument tolerates tabs and repeated spaces between the ratings
    tokens = score_original.split()
    if len(tokens) < 2:
        raise ValueError(f"Expected '<positive> <negative>' ratings, got: {score_original!r}")

    positive_score = int(tokens[0])
    negative_score = int(tokens[1])
    return 1 if positive_score > abs(negative_score) else 0


In [None]:

# Demonstrate the two SentiStrength helpers on a single sentence

example_sentence = "'What a lovely day!'"

rated = rate_sentiment(example_sentence)
converted = score_to_binary(rated)

# Print each heading followed by the value it describes
for heading, shown_value in (
    ("\nRating the sentence:", example_sentence),
    ("\nRated by SentiStrength:", rated),
    ("\nRating converted to 0 for dislike / 1 for like:", converted),
):
    print(heading)
    print(shown_value)


In [None]:

# Rate every review with SentiStrength and keep the binary (0 dislike / 1 like) result
# ! Running this takes a few minutes (approximately 3-5min)

sentistrength_sentiments = [
    score_to_binary(rate_sentiment(review_text))
    for review_text in review_column
]


In [None]:

# Calculate Pearson correlation between the SentiStrength evaluations and the liked column

pearson_corr_coeff = np.corrcoef(sentistrength_sentiments, liked_column)

print(f"\nPearson correlation coefficient matrix:\n{pearson_corr_coeff}")
# The off-diagonal entry [1][0] is the correlation between the two input vectors.
# (The label previously said "textblob", but the scores come from SentiStrength.)
print(f"\nPearson correlation between SentiStrength sentiment and review liked score:\n{pearson_corr_coeff[1][0]}")


----

# Task 2

Repeat this process when considering the correlation of the positive class alone and the correlation of the negative class alone.

In [None]:

# NOTE: scipy.spatial.distance.cosine returns the cosine *distance* (1 - similarity),
# so every value below is converted back with `1 - cosine(...)` before being reported
# as a similarity.

# Calculate overall cosine similarity
cosine_similarity = 1 - cosine(sentistrength_sentiments, liked_column)
print("\nOverall cosine similarity:", cosine_similarity)


# Calculate cosine similarity of positive class alone
# (reviews that SentiStrength rated 1, paired with their annotations)
positive_sentistrength_sentiments = []
positive_corresponding_liked_column = []

for index, rating in enumerate(sentistrength_sentiments):
    if rating == 1:
        positive_sentistrength_sentiments.append(rating)
        positive_corresponding_liked_column.append(liked_column[index])

positive_cosine_similarity = 1 - cosine(positive_sentistrength_sentiments, positive_corresponding_liked_column)
print("Positive cosine similarity:", positive_cosine_similarity)


# Calculate cosine similarity of negative class alone
# (reviews that SentiStrength rated 0, paired with their annotations)
negative_sentistrength_sentiments = []
negative_corresponding_liked_column = []

for index, rating in enumerate(sentistrength_sentiments):
    if rating == 0:
        # Add miniscule amount to the 0 vector to avoid division by 0 in the cosine calculation
        negative_sentistrength_sentiments.append(rating + 0.00000000000000001)
        negative_corresponding_liked_column.append(liked_column[index])

negative_cosine_similarity = 1 - cosine(negative_sentistrength_sentiments, negative_corresponding_liked_column)
print("Negative cosine similarity:", negative_cosine_similarity)


----

# Task 3

Now we want to test the correlation with respect to some stylistic aspects of the review. Write a script that estimates
the number of personal pronouns, the number of adjectives and the number of adverbs using a part-of-speech tagger of your
choice. Compute both the Pearson correlation and the cosine similarity between each of the above attributes (number of
pronouns, number of adjectives, number of adverbs) and the annotation.

In [None]:

# Helpers for printing the correlation / similarity results in a uniform format
def print_pearson_corr(first_thing_name, second_thing_name, pearson_corr_coeff):
    """Print the [1][0] entry of a correlation matrix, labelled with the two compared names."""
    coefficient = pearson_corr_coeff[1][0]
    print(f"Pearson correlation between {first_thing_name} and {second_thing_name}:\n{coefficient}\n")

def print_cosine_similarity(first_thing_name, second_thing_name, cosine_similarity):
    """Print the given cosine value, labelled with the two compared names."""
    message = f"Cosine similarity between {first_thing_name} and {second_thing_name}:\n{cosine_similarity}\n"
    print(message)


# Penn Treebank tags (as returned by nltk.pos_tag) counted for each attribute
PRONOUN_TAGS = {"PRP"}                      # personal pronouns
ADJECTIVE_TAGS = {"JJ", "JJR", "JJS"}       # base, comparative, superlative
ADVERB_TAGS = {"RB", "RBR", "RBS", "WRB"}   # base, comparative, superlative, wh-adverb

pronouns_in_review = []
adjectives_in_review = []
adverbs_in_review = []

# Count the pronouns, adjectives and adverbs in each review and add them to the corresponding lists
for row in review_column:
    # Tokenize and tag the whole review in one go: the tagger uses the neighbouring
    # words as context, every token is counted (not only the first one of each
    # whitespace-separated chunk), and empty chunks from repeated spaces can no
    # longer cause an IndexError.
    tag_counts = Counter(tag for _, tag in pos_tag(word_tokenize(row)))

    # Counter returns 0 for tags that do not occur in the review
    pronouns_in_review.append(sum(tag_counts[tag] for tag in PRONOUN_TAGS))
    adjectives_in_review.append(sum(tag_counts[tag] for tag in ADJECTIVE_TAGS))
    adverbs_in_review.append(sum(tag_counts[tag] for tag in ADVERB_TAGS))


print("\n---PEARSON CORRELATIONS---\n")
pearson_corr_coeff_pronoun = np.corrcoef(pronouns_in_review, liked_column)
pearson_corr_coeff_adjective = np.corrcoef(adjectives_in_review, liked_column)
pearson_corr_coeff_adverb = np.corrcoef(adverbs_in_review, liked_column)
print_pearson_corr("number of PRONOUNS", "review liked score", pearson_corr_coeff_pronoun)
print_pearson_corr("number of ADJECTIVES", "review liked score", pearson_corr_coeff_adjective)
print_pearson_corr("number of ADVERBS", "review liked score", pearson_corr_coeff_adverb)

print("\n---COSINE SIMILARITIES---\n")
# scipy's `cosine` is the cosine *distance* (1 - similarity); convert it back so that
# the printed value really is the similarity the label promises.
cosine_similarity_pronoun = 1 - cosine(pronouns_in_review, liked_column)
cosine_similarity_adjective = 1 - cosine(adjectives_in_review, liked_column)
cosine_similarity_adverb = 1 - cosine(adverbs_in_review, liked_column)
print_cosine_similarity("number of PRONOUNS", "review liked score", cosine_similarity_pronoun)
print_cosine_similarity("number of ADJECTIVES", "review liked score", cosine_similarity_adjective)
print_cosine_similarity("number of ADVERBS", "review liked score", cosine_similarity_adverb)

----

# Task 4

We want to test the hypothesis that the opinion about the restaurant is constructed according to price, quality
of food served in the restaurant, and friendly staff. Suggest a script that allows you to identify reviews that are
more focused on price, quality of food, or friendly staff. You may consider a set of keywords that are most suitable
to each category and then use simple string matching to match this effect. For each category, generate a binary
vector indicating whether the given review focuses on the corresponding category.

In [None]:

def any_keywords_in_string(keywords, row):
    """Return 1 if any keyword is among the lowercased, lemmatised tokens of `row`, else 0."""
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(row.lower())]

    # Stop at the first keyword that matches a whole token
    for keyword in keywords:
        if keyword in lemmas:
            return 1
    return 0

# Load the keyword lists per category. The context manager closes the file even if
# parsing fails, and the encoding is fixed so the result does not depend on the platform.
with open("keywords.json", "r", encoding="utf-8") as keyword_file:
    keywords = json.load(keyword_file)

# One binary entry per review: 1 if the review mentions any keyword of the category
price_focus = [any_keywords_in_string(keywords["price"], row) for row in review_column]
quality_focus = [any_keywords_in_string(keywords["quality"], row) for row in review_column]
staff_focus = [any_keywords_in_string(keywords["staff"], row) for row in review_column]


----

# Task 5

Estimate the correlation using Pearson correlation between each vector category and the data annotation.

In [None]:

# Pearson correlation between each binary focus vector and the annotation
pearson_corr_coeff_price = np.corrcoef(price_focus, liked_column)
pearson_corr_coeff_quality = np.corrcoef(quality_focus, liked_column)
pearson_corr_coeff_staff = np.corrcoef(staff_focus, liked_column)

print_pearson_corr("PRICE","liked score", pearson_corr_coeff_price)
print_pearson_corr("QUALITY","liked score", pearson_corr_coeff_quality)
# Report the staff matrix here; `pearson_corr_coeff` is the SentiStrength result from Task 1
print_pearson_corr("STAFF", "liked score", pearson_corr_coeff_staff)


----

# Task 6

We want to revisit the construction of the categories in 4). Instead of string matching, use semantic
similarity in the following way. Calculate the Wu and Palmer similarity between “price” and the Review (using
the sentence-to-sentence similarity as in labs), and repeat this process for the other three categories by suggesting
representative keyword(s) that will be used to calculate the sentence-to-sentence similarity score.

In [None]:

# Task 6 TODO




----

# Task 7

We want to test another approach for computing the categories by using the empath categories embedding. For this
purpose, re-visit the naming of the empath-categories in GitHub - Ejhfast/empath-client: analyze text with empath
(https://github.com/Ejhfast/empath-client) and select those that might be linked to Price, Quality, Staff friendship.
Write a code that allows you to determine appropriate categories from this embedding and then calculate the correlation
score.  Alternative to manual scrutinization of the Empath categories, you may also generate an empath category
embedding for the keyword “price”, “food quality”, “friendly staff”, and then compute cosine similarity between the
Review embedding vector and each of the above four embedding vectors, so that the one that yields the highest similarity
score will be considered as the one that best represents the underlined category.

In [None]:

lexicon = Empath()

# Create one custom Empath category per aspect, seeded with the corresponding keyword list
for heading, category_name in (
    ("\nPrice category:", "price"),
    ("\nFood quality category:", "quality"),
    ("\nStaff friendliness category:", "staff"),
):
    print(heading)
    lexicon.create_category(category_name, keywords[category_name], size=50)

empath_price = []
empath_quality = []
empath_staff = []

# For every review, store its normalized score in each of the three categories
category_targets = (
    ("price", empath_price),
    ("quality", empath_quality),
    ("staff", empath_staff),
)
for row in review_column:
    for category_name, category_scores in category_targets:
        analysis = lexicon.analyze(row, categories=[category_name], normalize=True)
        category_scores.append(analysis.get(category_name))


In [None]:

print("\n---EMPATH COSINE SIMILARITIES---\n")

# scipy's `cosine` is the cosine *distance* (1 - similarity); convert it back so that
# the printed value really is the similarity the label promises.
cosine_sim_empath_price = 1 - cosine(empath_price, liked_column)
cosine_sim_empath_quality = 1 - cosine(empath_quality, liked_column)
cosine_sim_empath_staff = 1 - cosine(empath_staff, liked_column)
print_cosine_similarity("number of Empath PRICE", "review liked score", cosine_sim_empath_price)
print_cosine_similarity("number of Empath QUALITY", "review liked score", cosine_sim_empath_quality)
print_cosine_similarity("number of Empath STAFF", "review liked score", cosine_sim_empath_staff)


----

# Task 8

We want to further emphasize on misclassified reviews. For this purpose, concatenate all reviews for which the
sentiment score is positive while the annotation is zero and those for which the sentiment is zero while the
annotation is 1. Construct the Wordcloud of this dataset. Write a histogram showing the 10 most common wordings
in this dataset. Comment on the findings.

In [None]:

# Collect misclassified ratings
# The SentiStrength rating of every review is already stored in `sentistrength_sentiments`
# (same order as `review_column`), so it is reused here instead of spending another few
# minutes re-running the Java process for each review.

misclassified_reviews = [
    [row, liked_column[index]]
    for index, (row, converted_sentiment) in enumerate(zip(review_column, sentistrength_sentiments))
    if converted_sentiment != liked_column[index]
]

print("\nNumber of misclassified ratings:")
print(len(misclassified_reviews))


In [None]:

# Pre-processing: lowercase, verb lemmatisation, then drop stopwords and non-alphabetic tokens
# Returns a list of tokens
def pre_process(doc_string):
    """Tokenize `doc_string` and return its cleaned tokens.

    The text is lowercased and tokenized, each token is lemmatised as a verb, and
    only lemmas that are purely alphabetic and not NLTK English stopwords are kept.
    """
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    kept_tokens = []
    for token in word_tokenize(doc_string.lower()):
        lemma = lemmatizer.lemmatize(token, pos="v")
        # isalpha() rejects tokens containing digits or punctuation
        if lemma.isalpha() and lemma not in english_stopwords:
            kept_tokens.append(lemma)
    return kept_tokens


# Concatenate the misclassified review texts (each followed by a space) and pre-process them
misclassified_reviews_string = "".join(f"{review[0]} " for review in misclassified_reviews)
misclassified_reviews_preprocessed = pre_process(misclassified_reviews_string)

token_total = len(misclassified_reviews_preprocessed)
print("\nNumber of word tokens after preprocessing:")
print(token_total, "\n")

first_tokens = misclassified_reviews_preprocessed[:50]
print("Word tokens after preprocessing (print truncated to first 50 elements):")
print(first_tokens, "\n")


In [None]:

# The ten most frequent tokens as (word, count) pairs
most_common_words = Counter(misclassified_reviews_preprocessed).most_common(10)

# Split the pairs into parallel lists for plotting
words = [pair[0] for pair in most_common_words]
counts = [pair[1] for pair in most_common_words]

# Bar chart of word frequencies
fig, ax = plt.subplots()
ax.bar(words, counts)
ax.set_title("Histogram of the 10 most common words in misclassified reviews")
ax.set_ylabel("Frequency")
ax.set_xlabel("Words")
plt.show()


In [None]:

# Build a word cloud from the pre-processed tokens of the misclassified reviews

cloud_text = " ".join(misclassified_reviews_preprocessed)
cloud_options = {
    "width": 3000,
    "height": 2000,
    "random_state": 1,
    "background_color": "#1f1f36",
    "colormap": "Blues",
    "collocations": False,
}
word_cloud = WordCloud(**cloud_options).generate(cloud_text)

print("\nWord cloud of the misclassified reviews:")

# Show the cloud as an image without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(word_cloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


----

# Task 9.

Now we would like to build a machine learning model for sentiment analysis that takes into account the ambiguous
cases identified in 8). For this purpose, write a script and review the preprocessing and stopword list so as not to
discard relevant information in the context of sentiment analysis (e.g., avoid discarding negation cues, adjectives
that subsume polarity, and apostrophes; avoid lower-casing, as capitalization brings emotion), then use TfidfVectorizer
with a maximum feature set of 500, a minimum of 2 repetitions and no more than 60% of word repetition across sentences.
Build this model for one dataset using randomly selected 70% training and 30% testing. Report the classification
accuracy.


### Prepare Training and Testing Data

In [None]:
misclassified_review_column = [review[0] for review in misclassified_reviews]
misclassified_liked_column = [review[1] for review in misclassified_reviews]
# NOTE(review): the models below are trained and tested on the misclassified reviews
# only — confirm this is intended rather than using the full dataset.

# Lemmatisation is the only type of pre-processing used for the machine learning models.
# Removing stopwords discards important negation cues. For example, the word "not" -> "not good" becomes "good".
# Turning the words to lowercase is not used because capitalization brings emotion. For example, FULL CAPITALIZATION.
# The lemmatizer works on single words, so each review is tokenized, lemmatised token by
# token and joined back into one string (passing a whole review to it changes nothing).
lemmatizer = WordNetLemmatizer()
misclassified_review_column = [
    " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(review_text))
    for review_text in misclassified_review_column
]

# Fixed seed so that the random 70/30 split, and therefore the reported accuracy, is reproducible
RANDOM_SEED = 42

# Divide the reviews into training and test data
review_train, review_test, liked_train, liked_test = train_test_split(
    misclassified_review_column,
    misclassified_liked_column,
    test_size=0.3,
    random_state=RANDOM_SEED
)

# Use tf-idf vectorizer to fit and transform review training data.
# lowercase=False keeps the capitalization, as stated above (the vectorizer lowercases by default).
TFIDF = TfidfVectorizer(max_features=500, max_df=0.60, min_df=2, lowercase=False)
tfidf_fit_trans_review = TFIDF.fit_transform(review_train)


### Support Vector Classification Model

In [None]:

# Support vector classifier trained on the tf-idf features of the training reviews
svc_model = SVC()
svc_model.fit(tfidf_fit_trans_review, liked_train)

# Vectorize the test reviews with the already fitted tf-idf vocabulary and predict their labels
svc_model_predictions = svc_model.predict(TFIDF.transform(review_test))

# Share of test reviews whose predicted label matches the annotation
svc_model_accuracy = accuracy_score(liked_test, svc_model_predictions)

print(f"\nSupport vector classification model accuracy:\n{svc_model_accuracy}")

### Logistic Regression Model

In [None]:

# Logistic regression model
log_reg_model = LogisticRegression()

# Train the model with the review column tf-idf fitted and transformed training data, and liked column training data
log_reg_model.fit(
    tfidf_fit_trans_review,
    liked_train
)

# Make predictions using the model (test reviews are vectorized with the fitted tf-idf vocabulary)
log_reg_model_predictions = log_reg_model.predict(
    TFIDF.transform(review_test)
)

# Get accuracy of the predictions
log_reg_model_accuracy = accuracy_score(liked_test, log_reg_model_predictions)

# Label the result as logistic regression (it was previously printed as the SVC accuracy)
print(f"\nLogistic regression model accuracy:\n{log_reg_model_accuracy}")


----

# Task 10

Use Glove embedding instead of TfidfVectorizer, see GloVe: Global Vectors for Word Representation
(https://nlp.stanford.edu/projects/glove/). Use the Glove embedding as feature vectors and test the performance in
the original data (30% test data) and report the classification accuracy on the other two datasets. Comment on the
limitations of the approach

----

# Task 11

Identify appropriate literature to comment on your findings and methodology.

In [None]:

# Task 11 TODO


