In [1]:
# Write your imports here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pkola\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Working with Text Lab
## Information retrieval, preprocessing, and feature extraction

In this lab, you'll be looking at and exploring European restaurant reviews. The dataset is rather tiny, but that's just because it has to run on any machine. In real life, just like with images, texts can be several terabytes long.

The dataset is located [here](https://www.kaggle.com/datasets/gorororororo23/european-restaurant-reviews) and as always, it's been provided to you in the `data/` folder.

### Problem 1. Read the dataset (1 point)
Read the dataset, get acquainted with it. Ensure the data is valid before you proceed.

How many observations are there? Which country is the most represented? What time range does the dataset represent?

Is the sample balanced in terms of restaurants, i.e., do you have an equal number of reviews for each one? Most importantly, is the dataset balanced in terms of **sentiment**?

In [2]:
df = pd.read_csv("data/European Restaurant Reviews.csv")
df

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review
0,France,The Frog at Bercy Village,Negative,Rude manager,May 2024 •,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,Negative,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...
...,...,...,...,...,...,...
1497,Cuba,Old Square (Plaza Vieja),Negative,The Tourism Trap,Oct 2016 •,Despite the other reviews saying that this is ...
1498,Cuba,Old Square (Plaza Vieja),Negative,the beer factory,Oct 2016 •,beer is good. food is awfull The only decent...
1499,Cuba,Old Square (Plaza Vieja),Negative,brewery,Oct 2016 •,"for terrible service of a truly comedic level,..."
1500,Cuba,Old Square (Plaza Vieja),Negative,It's nothing exciting over there,Oct 2016 •,We visited the Havana's Club Museum which is l...


In [3]:
df['Country'].value_counts()

Country
France     512
Italy      318
Morroco    210
Cuba       146
Poland     135
Russia     100
India       81
Name: count, dtype: int64

In [4]:
for i in range (len(df['Review Date'])):
    new_date = str(df['Review Date'][i])
    df['Review Date'][i] = new_date[:-2]

In [5]:
df = df.drop(df[df['Review Date'] == 'Sep 20'].index)
df['Review Date'] = pd.to_datetime(df['Review Date'], format='mixed')
df['Review Date'] = pd.to_datetime(df['Review Date'], format='%m%Y').dt.to_period('m')

In [6]:
df['Review Date'].unique()

<PeriodArray>
['2024-05', '2024-02', '2023-11', '2023-03', '2022-11', '2021-07', '2020-01',
 '2019-10', '2019-06', '2019-05',
 ...
 '2019-11', '2019-12', '2024-07', '2014-09', '2013-10', '2023-01', '2022-05',
 '2021-09', '2020-03', '2020-04']
Length: 138, dtype: period[M]

In [7]:
df['Sentiment'].value_counts()

Sentiment
Positive    1234
Negative     255
Name: count, dtype: int64

In [8]:
df['Restaurant Name'].value_counts()

Restaurant Name
The Frog at Bercy Village                512
Ad Hoc Ristorante (Piazza del Popolo)    318
The LOFT                                 210
Old Square (Plaza Vieja)                 136
Stara Kamienica                          135
Pelmenya                                  97
Mosaic                                    81
Name: count, dtype: int64

In [9]:
df.dropna(subset=['Restaurant Name'])
df.reset_index(inplace=True)
df = df.drop('index', axis=1)
df

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review
0,France,The Frog at Bercy Village,Negative,Rude manager,2024-05,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,Negative,A big disappointment,2024-02,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,2023-11,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,2023-03,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,2022-11,From the start this meal was bad- especially g...
...,...,...,...,...,...,...
1484,Cuba,Old Square (Plaza Vieja),Negative,The Tourism Trap,2016-10,Despite the other reviews saying that this is ...
1485,Cuba,Old Square (Plaza Vieja),Negative,the beer factory,2016-10,beer is good. food is awfull The only decent...
1486,Cuba,Old Square (Plaza Vieja),Negative,brewery,2016-10,"for terrible service of a truly comedic level,..."
1487,Cuba,Old Square (Plaza Vieja),Negative,It's nothing exciting over there,2016-10,We visited the Havana's Club Museum which is l...


In [10]:
new_column_names = []

for name in df.columns:
    new_name = name.replace(" ", "_").lower()
    new_column_names.append(new_name)

df.columns = new_column_names
df.columns

Index(['country', 'restaurant_name', 'sentiment', 'review_title',
       'review_date', 'review'],
      dtype='object')

### Problem 2. Getting acquainted with reviews (1 point)
Are positive comments typically shorter or longer? Try to define a good, robust metric for "length" of a text; it's not necessary just the character count. Can you explain your findings?

In [11]:
# We will look at both character and word counts, so we can see if there is a big difference between the two

In [12]:
total_positive_ch_length = 0
total_negative_ch_length = 0
total_positive_count = 0
total_negative_count = 0

for i in range (len(df.review)):
    if df.sentiment[i] == 'Positive':
        total_positive_ch_length += len(df.review[i])
        total_positive_count += 1
    else:
        total_negative_ch_length += len(df.review[i])
        total_negative_count += 1

avg_positive_ch_length = total_positive_ch_length / total_positive_count
avg_negative_ch_length = total_negative_ch_length / total_negative_count
avg_positive_ch_length, avg_negative_ch_length

(281.79578606158833, 781.8039215686274)

In [13]:
total_positive_word_length = 0
total_negative_word_length = 0

for i in range (len(df.review)):
    if df.sentiment[i] == 'Positive':
        total_positive_word_length += len(df.review[i].split(' '))
    else:
        total_negative_word_length += len(df.review[i].split(' '))

avg_positive_word_length = total_positive_word_length / total_positive_count
avg_negative_word_length = total_negative_word_length / total_negative_count
avg_positive_word_length, avg_negative_word_length

(50.37277147487844, 144.2313725490196)

In [14]:
# The ratios between both the average character and word lengths seem quite similar.
# As the review title may also contain some information, we will check the word count there as well

total_positive_word_length_titles = 0
total_negative_word_length_titles = 0

for i in range (len(df.review)):
    if df.sentiment[i] == 'Positive':
        total_positive_word_length_titles += len(df.review_title[i].split(' '))
    else:
        total_negative_word_length_titles += len(df.review_title[i].split(' '))

avg_positive_word_length_titles = total_positive_word_length_titles / total_positive_count
avg_negative_word_length_titles = total_negative_word_length_titles / total_negative_count
avg_positive_word_length_titles, avg_negative_word_length_titles

(3.9027552674230144, 4.698039215686275)

In [15]:
# While the ratio isn't the same, it is still slightly longer for negative comments on average
# We will therefore maintain that negative reveiws are on average longer than positive ones

### Problem 3. Preprocess the review content (2 points)
You'll likely need to do this while working on the problems below, but try to synthesize (and document!) your preprocessing here. Your tasks will revolve around words and their connection to sentiment. While preprocessing, keep in mind the domain (restaurant reviews) and the task (sentiment analysis).

In [16]:
df['review_words'] = df.review.str.split('\s+')

In [17]:
df.review_words = df.review_words.apply(lambda word_list: [w.lower() for w in word_list if w.lower() not in stopwords.words('english')])
df.review_words

0       [manager, became, agressive, said, carbonara, ...
1       [ordered, beef, fillet, ask, done, medium,, go...
2       [attractive, venue, welcoming,, albeit, somewh...
3       [sadly, used, high, tripadvisor, rating, liter...
4       [start, meal, bad-, especially, given, price.,...
                              ...                        
1484    [despite, reviews, saying, 'lovely, place, han...
1485    [beer, good., food, awfull, decent, thing, shi...
1486    [terrible, service, truly, comedic, level,, 2/...
1487    [visited, havana's, club, museum, located, old...
1488    [food, service, awful., pretty, stop., good, p...
Name: review_words, Length: 1489, dtype: object

In [18]:
def clean_characters(words_list, ch_list = ['-', '.', ',', '?', '!', '"', "'"]):
    cleaned_list = words_list.copy()
    print(cleaned_list)
    for ch in ch_list:
        cleaned_list = [w.replace(ch, '') for w in cleaned_list]
    cleaned_list = [w for w in cleaned_list if w != '']
    print(cleaned_list)
    return cleaned_list

In [19]:
df.review_words = df.review_words.apply(clean_characters)

['manager', 'became', 'agressive', 'said', 'carbonara', 'good.', 'rude.', 'bad', 'experience.', 'worst', 'rome', 'many', 'years.']
['manager', 'became', 'agressive', 'said', 'carbonara', 'good', 'rude', 'bad', 'experience', 'worst', 'rome', 'many', 'years']
['ordered', 'beef', 'fillet', 'ask', 'done', 'medium,', 'got', 'well', 'done', 'cooked', 'dry,', 'told', 'took', 'steak', '20', 'minutes', 'brought', 'another', 'steak', 'completely', 'rare,', 'left', 'steak', ',', 'asked', 'charged', 'fool', 'price.']
['ordered', 'beef', 'fillet', 'ask', 'done', 'medium', 'got', 'well', 'done', 'cooked', 'dry', 'told', 'took', 'steak', '20', 'minutes', 'brought', 'another', 'steak', 'completely', 'rare', 'left', 'steak', 'asked', 'charged', 'fool', 'price']
['attractive', 'venue', 'welcoming,', 'albeit', 'somewhat', 'slow', 'service.', 'offerings', 'pleasingly', 'presented', 'everything', 'taste.', 'though', 'ingredients', 'assembled', 'without', 'seasoning:', 'lovely', 'restaurant,', 'except', 'fo

In [20]:
df.review_words

0       [manager, became, agressive, said, carbonara, ...
1       [ordered, beef, fillet, ask, done, medium, got...
2       [attractive, venue, welcoming, albeit, somewha...
3       [sadly, used, high, tripadvisor, rating, liter...
4       [start, meal, bad, especially, given, price, v...
                              ...                        
1484    [despite, reviews, saying, lovely, place, hang...
1485    [beer, good, food, awfull, decent, thing, shis...
1486    [terrible, service, truly, comedic, level, 2/3...
1487    [visited, havanas, club, museum, located, old,...
1488    [food, service, awful, pretty, stop, good, pho...
Name: review_words, Length: 1489, dtype: object

In [21]:
df['review_title_words'] = df.review_title.str.split('\s+')
df.review_title_words = df.review_title_words.apply(lambda word_list: [w.lower() for w in word_list if w.lower() not in stopwords.words('english')])
df.review_title_words = df.review_title_words.apply(clean_characters)
df.review_title_words

['rude', 'manager']
['rude', 'manager']
['big', 'disappointment']
['big', 'disappointment']
['pretty', 'place', 'bland', 'food']
['pretty', 'place', 'bland', 'food']
['great', 'service', 'wine', 'inedible', 'food']
['great', 'service', 'wine', 'inedible', 'food']
['avoid-', 'worst', 'meal', 'rome', '-', 'possibly', 'ever']
['avoid', 'worst', 'meal', 'rome', 'possibly', 'ever']
['shocking', 'management,', 'terrible', 'service', 'mum', 'daughter!!', 'tourist', 'only!!']
['shocking', 'management', 'terrible', 'service', 'mum', 'daughter', 'tourist', 'only']
['tired', 'tasting', 'menu', '-', 'avoid']
['tired', 'tasting', 'menu', 'avoid']
['huge', 'disappointment']
['huge', 'disappointment']
['expensive', 'mediocre', 'food', 'service']
['expensive', 'mediocre', 'food', 'service']
['around', 'awful']
['around', 'awful']
['big', 'disappointment', 'due', 'poor', 'service', 'arrogant', 'attitudes']
['big', 'disappointment', 'due', 'poor', 'service', 'arrogant', 'attitudes']
['perfectly', 'organ

0                                  [rude, manager]
1                            [big, disappointment]
2                     [pretty, place, bland, food]
3           [great, service, wine, inedible, food]
4       [avoid, worst, meal, rome, possibly, ever]
                           ...                    
1484                               [tourism, trap]
1485                               [beer, factory]
1486                                     [brewery]
1487                           [nothing, exciting]
1488                               [tourist, trap]
Name: review_title_words, Length: 1489, dtype: object

### Problem 3. Top words (1 point)
Use a simple word tokenization and count the top 10 words in positive reviews; then the top 10 words in negative reviews*. Once again, try to define what "top" words means. Describe and document your process. Explain your results.

\* Okay, you may want to see top N words (with $N \ge 10$).

In [22]:
# We will check what the 20 most commonly used words are in each sentiment category, so as to give ourselves a bit more context
df_sentiment_grouped = df[['sentiment', 'review_words', 'review_title_words']]
df_sentiment_grouped = df_sentiment_grouped.groupby("sentiment", as_index=False).sum()

In [23]:
df_sentiment_grouped

Unnamed: 0,sentiment,review_words,review_title_words
0,Negative,"[manager, became, agressive, said, carbonara, ...","[rude, manager, big, disappointment, pretty, p..."
1,Positive,"[booked, frog, advance, pleased, busy…, great,...","[wonderful, excellent, family, dinner, nice, l..."


In [24]:
def sentiment_word_frequency_counter(data, col):
    counter = 0
    top_20_word_fr = []
    for word_list in data[col]:
        word_frequencies = Counter(word_list)
        top_20_word_fr.append(word_frequencies.most_common(20))
        counter += 1
        
    return top_20_word_fr[0], top_20_word_fr[1]

top_20_word_neg, top_20_word_pos = sentiment_word_frequency_counter(df_sentiment_grouped, 'review_words')

print(f'Top 20 {df_sentiment_grouped.sentiment[0]} words: {top_20_word_neg}')
print(f'Top 20 {df_sentiment_grouped.sentiment[1]} words: {top_20_word_pos}')

Top 20 Negative words: [('food', 243), ('us', 203), ('restaurant', 180), ('wine', 173), ('good', 145), ('service', 143), ('table', 140), ('menu', 131), ('one', 128), ('would', 127), ('rome', 100), ('place', 92), ('meal', 90), ('could', 87), ('nice', 86), ('waitress', 85), ('really', 85), ('time', 84), ('asked', 83), ('like', 82)]
Top 20 Positive words: [('food', 728), ('great', 569), ('service', 534), ('good', 503), ('place', 368), ('restaurant', 352), ('nice', 300), ('staff', 254), ('wine', 239), ('friendly', 234), ('menu', 233), ('excellent', 228), ('delicious', 221), ('really', 215), ('us', 212), ('amazing', 188), ('recommend', 187), ('atmosphere', 182), ('time', 177), ('would', 173)]


In [25]:
# Interestingly enough, both top 20 lists contain exclusively positive and neutral words, regardless of the sentiment.
# As we have removed all stopwords from this column however, it might not be that much of a surprise
# Just for some context, let's compare how often some negative words appear in both categories
negative_w_list = ['bad', 'disappointed', 'poor', 'horrible', 'overpriced', 'awful', 'bland', 'disgusting']
negative_occurances = []
positive_occurances = []

for w in negative_w_list:
    negative_occurances.append(df_sentiment_grouped.review_words.iloc[0].count(w))
    positive_occurances.append(df_sentiment_grouped.review_words.iloc[1].count(w))
    
for i in range(len(negative_w_list)):
    print(f'{negative_w_list[i]}: negative({negative_occurances[i]}), positive({positive_occurances[i]})')

bad: negative(40), positive(4)
disappointed: negative(35), positive(19)
poor: negative(23), positive(5)
horrible: negative(4), positive(0)
overpriced: negative(5), positive(1)
awful: negative(18), positive(1)
bland: negative(12), positive(1)
disgusting: negative(2), positive(0)


### Problem 4. Review titles (2 point)
How do the top words you found in the last problem correlate to the review titles? Do the top 10 words (for each sentiment) appear in the titles at all? Do reviews which contain one or more of the top words have the same words in their titles?

Does the title of a comment present a good summary of its content? That is, are the titles descriptive, or are they simply meant to catch the attention of the reader?

In [26]:
top_20_title_word_neg, top_20_title_word_pos = sentiment_word_frequency_counter(df_sentiment_grouped, 'review_title_words')

print(f'Top 20 {df_sentiment_grouped.sentiment[0]} words: {top_20_title_word_neg}')
print(f'Top 20 {df_sentiment_grouped.sentiment[1]} words: {top_20_title_word_pos}')

Top 20 Negative words: [('food', 32), ('service', 25), ('disappointing', 21), ('bad', 15), ('place', 14), ('rome', 14), ('great', 12), ('terrible', 12), ('restaurant', 11), ('average', 10), ('poor', 9), ('overrated', 9), ('overpriced', 9), ('ad', 9), ('hoc', 8), ('wine', 7), ('expensive', 7), ('experience', 7), ('ok', 7), ('disappointed', 7)]
Top 20 Positive words: [('great', 225), ('food', 176), ('good', 105), ('place', 102), ('service', 95), ('excellent', 78), ('best', 78), ('dinner', 74), ('restaurant', 72), ('amazing', 66), ('experience', 63), ('nice', 48), ('delicious', 42), ('wonderful', 41), ('lovely', 37), ('beer', 33), ('atmosphere', 31), ('meal', 31), ('fantastic', 30), ('perfect', 27)]


In [27]:
# At first glance, there is a much bigger difference in the word sentiment used in the titles, compared to the actual reviews
# Let's check if reviews which contain any of the top words have the same words in their titles
top_20_word_neg_cleaned = [el[0] for el in top_20_word_neg]
top_20_word_pos_cleaned = [el[0] for el in top_20_word_pos]

In [28]:
results = []
for r_list in df.review_words:
    for w in r_list:
        if w in top_20_word_neg_cleaned:
            results.append(w in top_20_title_word_neg)
        elif w in top_20_word_pos_cleaned:
            results.append(w in top_20_title_word_pos)

results.count(False), results.count(True)

(9477, 0)

In [29]:
# It seems that, at least where the top 20 words in reviews are concerned, reviewers never use those words in the actual texts
# Nonetheless, since the top 20 words in the titles appear to be more reflective of the reviews' sentiment, they are not necessarily misleading overall
######

### Problem 5. Bag of words (1 point)
Based on your findings so far, come up with a good set of settings (hyperparameters) for a bag-of-words model for review titles and contents. It's easiest to treat them separately (so, create two models); but you may also think about a unified representation. I find the simplest way of concatenating the title and content too simplistic to be useful, as it doesn't allow you to treat the title differently (e.g., by giving it more weight).

The documentation for `CountVectorizer` is [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html). Familiarize yourself with all settings; try out different combinations and come up with a final model; or rather - two models :).

In [30]:
vectorizer_review_content = CountVectorizer(ngram_range = (1, 2), min_df = 3, max_df = 0.80, stop_words='english')
vectorizer_review_content.fit(df.review)

In [31]:
vectorizer_review_content.vocabulary_

{'manager': 2470,
 'said': 3535,
 'carbonara': 589,
 'good': 1745,
 'rude': 3511,
 'bad': 351,
 'experience': 1341,
 'worst': 4731,
 'rome': 3478,
 'years': 4754,
 'ordered': 2827,
 'beef': 394,
 'fillet': 1455,
 'ask': 249,
 'medium': 2527,
 'got': 1795,
 'cooked': 813,
 'dry': 1153,
 'told': 4229,
 'took': 4243,
 'steak': 3991,
 '20': 29,
 'minutes': 2600,
 'brought': 540,
 'completely': 778,
 'rare': 3224,
 'left': 2274,
 'asked': 255,
 'charged': 638,
 'price': 3125,
 'ordered beef': 2828,
 'beef fillet': 397,
 '20 minutes': 31,
 'venue': 4446,
 'welcoming': 4614,
 'somewhat': 3828,
 'slow': 3800,
 'service': 3672,
 'presented': 3113,
 'taste': 4112,
 'ingredients': 2101,
 'just': 2172,
 'lovely': 2414,
 'restaurant': 3371,
 'food': 1511,
 'slow service': 3803,
 'lovely restaurant': 2425,
 'restaurant food': 3386,
 'sadly': 3527,
 'used': 4408,
 'high': 1954,
 'tripadvisor': 4308,
 'rating': 3227,
 'literally': 2324,
 'eat': 1181,
 'number': 2760,
 'fine': 1464,
 'restaurants': 341

In [32]:
vectorizer_review_title = CountVectorizer(ngram_range = (1, 2), min_df = 3, max_df = 0.70, stop_words='english')
vectorizer_review_title.fit(df.review_title)

In [33]:
vectorizer_review_title.vocabulary_

{'rude': 341,
 'big': 52,
 'disappointment': 107,
 'big disappointment': 53,
 'pretty': 303,
 'place': 288,
 'food': 149,
 'great': 182,
 'service': 347,
 'wine': 429,
 'great service': 198,
 'worst': 443,
 'meal': 255,
 'rome': 336,
 'meal rome': 256,
 'management': 254,
 'terrible': 389,
 'tourist': 398,
 'terrible service': 391,
 'tasting': 384,
 'menu': 259,
 'tasting menu': 386,
 'expensive': 132,
 'mediocre': 257,
 'food service': 158,
 'awful': 31,
 'poor': 300,
 'poor service': 301,
 'disappointing': 105,
 'wines': 432,
 'average': 28,
 'average food': 29,
 'new': 268,
 'truffle': 407,
 'life': 235,
 'hours': 219,
 'just': 228,
 'won': 434,
 'going': 173,
 'bad': 32,
 'experience': 134,
 'rated': 312,
 'restaurant': 323,
 'dinner': 101,
 'special': 358,
 'highly': 212,
 'night': 274,
 'don': 110,
 'money': 262,
 'drink': 112,
 'music': 267,
 'nice': 269,
 'taste': 383,
 'atmosphere': 23,
 'ok': 275,
 'service great': 350,
 'great atmosphere': 183,
 'choice': 70,
 'disappointed'

### Problem 6. Deep sentiment analysis models (1 point)
Find a suitable model for sentiment analysis in English. Without modifying, training, or fine-tuning the model, make it predict all contents (or better, combinations of titles and contents, if you can). Meaure the accuracy of the model compared to the `sentiment` column in the dataset.

In [34]:
labels = ['Positive'] * df['sentiment'].value_counts()['Positive'] + ['Negative'] * df['sentiment'].value_counts()['Negative']

In [35]:
analyzer = SentimentIntensityAnalyzer()
sentiment_scores = []

for r in df.review:
    sentiment_scores.append(analyzer.polarity_scores(r))

sentiment_scores

[{'neg': 0.377, 'neu': 0.623, 'pos': 0.0, 'compound': -0.9231},
 {'neg': 0.107, 'neu': 0.866, 'pos': 0.027, 'compound': -0.6705},
 {'neg': 0.158, 'neu': 0.758, 'pos': 0.084, 'compound': -0.6601},
 {'neg': 0.091, 'neu': 0.677, 'pos': 0.232, 'compound': 0.993},
 {'neg': 0.079, 'neu': 0.845, 'pos': 0.076, 'compound': 0.0224},
 {'neg': 0.084, 'neu': 0.878, 'pos': 0.039, 'compound': -0.9437},
 {'neg': 0.279, 'neu': 0.721, 'pos': 0.0, 'compound': -0.858},
 {'neg': 0.092, 'neu': 0.869, 'pos': 0.039, 'compound': -0.7263},
 {'neg': 0.142, 'neu': 0.791, 'pos': 0.066, 'compound': -0.3892},
 {'neg': 0.071, 'neu': 0.846, 'pos': 0.083, 'compound': 0.1872},
 {'neg': 0.114, 'neu': 0.764, 'pos': 0.121, 'compound': 0.6306},
 {'neg': 0.076, 'neu': 0.78, 'pos': 0.145, 'compound': 0.7338},
 {'neg': 0.063, 'neu': 0.797, 'pos': 0.14, 'compound': 0.9601},
 {'neg': 0.117, 'neu': 0.836, 'pos': 0.047, 'compound': -0.9714},
 {'neg': 0.045, 'neu': 0.723, 'pos': 0.232, 'compound': 0.9163},
 {'neg': 0.096, 'neu': 0.

In [36]:
# While the neutral scores seem to often greater than either positive or negative scores, we will not classify any as 'neutral'
# We will assume that any reviews with a negative score equal to the positive score will be classified as negative
# In the end, it appears that roughly eight out of every nine reviews had matching sentiments

df_new_sentiments = df.copy()
df_new_sentiments['new_sentiments'] = ['Negative' if s['neg'] >= s['pos'] else 'Positive' for s in sentiment_scores]

matching_sentiments_indexes = []
for i in range(len(df_new_sentiments.sentiment)):
    if df_new_sentiments.sentiment[i] == df_new_sentiments.new_sentiments[i]:
        matching_sentiments_indexes.append(i)

ratio_of_matches = len(matching_sentiments_indexes) / len(df_new_sentiments.sentiment)
ratio_of_matches

0.8824714573539288

### Problem 7. Deep features (embeddings) (1 point)
Use the same model to perform feature extraction on the review contents (or contents + titles) instead of direct predictions. You should already be familiar how to do that from your work on images.

Use the cosine similarity between texts to try to cluster them. Are there "similar" reviews (you'll need to find a way to measure similarity) across different restaurants? Are customers generally in agreement for the same restaurant?

### \* Problem 8. Explore and model at will
In this lab, we focused on preprocessing and feature extraction and we didn't really have a chance to train (or compare) models. The dataset is maybe too small to be conclusive, but feel free to play around with ready-made models, and train your own.