In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch

plt.style.use('ggplot')

In [None]:
# Fetch the NLTK resources used later in this notebook.
# nltk.download() only returns a success flag, so its result is not kept
# (the previous `words = ...` binding held that flag, not the corpus).
for resource in (
    'punkt_tab',                       # To use word_tokenize()
    'averaged_perceptron_tagger_eng',  # To use pos_tag()
    'maxent_ne_chunker_tab',           # To use ne_chunk()
    'words',                           # English word list corpus
    'vader_lexicon',                   # To use SentimentIntensityAnalyzer()
):
  nltk.download(resource)

In [None]:
import kagglehub

# Download the latest version of the dataset through kagglehub and keep the
# returned location in `path`; it is used to locate Reviews.csv when loading.
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")

print("Path to dataset files:", path)

In [None]:
# Load the reviews table from the downloaded dataset folder and report its
# (rows, columns) size.
df = pd.read_csv(
    filepath_or_buffer=path + '/Reviews.csv',
)
print(df.shape)

In [None]:
# Keep only the first 500 reviews so the per-row scoring further down stays
# quick, then confirm the new size.
df = df.iloc[:500]
print(df.shape)

## Quick EDA


In [None]:
# Bar chart of how many reviews were given each star rating; sort_index()
# puts the bars in ascending score order instead of by frequency.
ax = (
    df['Score']
    .value_counts()
    .sort_index()
    .plot(kind="bar", title="Count of Reviews by Stars", figsize=(10, 5))
)

ax.set_xlabel("Review Stars")
ax.set_ylabel("Number of Reviews")
plt.show()

## Basic NLTK


In [None]:
# Pick a single review text (position 10) to use as the running example
# in the cells that follow.
example = df['Text'].values[10]
print(example)

In [None]:
# NLTK Function 1 -- extracting the tokens
tokens = nltk.word_tokenize(example)
tokens[:10]  # preview the first ten tokens

In [None]:
# NLTK Function 2 -- POS Tagging
# Tags the tokens produced above; the preview shows the first ten tagged tokens.
tagged = nltk.pos_tag(tokens)
tagged[:10] # Link for pos_tags dictionary: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [None]:
# NLTK Function 3 -- Named Entity Recognition
# Groups the POS-tagged tokens together into chunks, then pretty-prints
# the resulting structure.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

# VADER Sentiment Scoring


VADER takes the words in a text and scores the text as positive, negative, or neutral. Each word contributes a score describing how positive or how negative that word is.
Bag of Words approach:

- Stop words are removed
- Each word is scored and combined to get a total score


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Create the VADER analyzer once; the same `sia` object is reused for every
# polarity_scores() call in the rest of the notebook.
sia = SentimentIntensityAnalyzer()

In [None]:
# Quick check of the analyzer on a clearly positive sentence.
sia.polarity_scores('I am so happy!')

In [None]:
# Quick check of the analyzer on a clearly negative sentence.
sia.polarity_scores('This is the worst thing ever.')

In [None]:
# Score the example review selected earlier.
sia.polarity_scores(example)

In [None]:
# Score every review in the frame with VADER.
# `res` maps each review's Id to the dict returned by polarity_scores().
res = {}
for _, review in tqdm(df.iterrows(), total=len(df)):
  res[review["Id"]] = sia.polarity_scores(review['Text'])

In [None]:
# Turn the {Id: scores} mapping into a table with one row per review
# (.T flips the frame so the Ids become rows), expose the Id as a regular
# column, and attach the original review columns with a left merge.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

In [None]:
# Preview the merged table.
vaders.head() # this includes metadata and the sentiment score

In [None]:
# Bar plot of the VADER compound score grouped by the review's star rating.
ax = sns.barplot(data=vaders, x="Score", y="compound")
ax.set_title("Compound Score by Amazon Star Review")
plt.show()

In [None]:
# One panel per VADER component (positive / neutral / negative), each
# plotted against the review's star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))

for panel, column, heading in zip(
    axs,
    ("pos", "neu", "neg"),
    ("Positive", "Neutral", "Negative"),
):
  sns.barplot(data=vaders, x="Score", y=column, ax=panel)
  panel.set_title(heading)

plt.tight_layout()
plt.show()

# Roberta Pretrained Model

VADER might not be sufficient since it doesn't consider the relationships between words. Therefore, we need a transformer model that does take them into account.


In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Checkpoint id used for both the tokenizer and the classifier, so the two
# always come from the same pretrained model.
# (Plain string: the previous f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Show the example review again together with its VADER scores, for
# comparison with the RoBERTa scores computed next.
print(example)
sia.polarity_scores(example)

In [None]:
# Run the RoBERTa model on the example review:
# tokenize -> forward pass -> take the logits of the single input ->
# softmax -> label the three resulting values.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = softmax(output[0][0].detach().numpy())
scores_dict = {
    key: scores[position]
    for position, key in enumerate(('roberta_neg', 'roberta_neu', 'roberta_pos'))
}
print(scores_dict)

In [None]:
def polarity_scores_roberta(example):
  """Score a single text with the RoBERTa sentiment model.

  Relies on the notebook-level ``tokenizer`` and ``model`` objects.
  No truncation is requested from the tokenizer, so the whole text is
  encoded and passed to the model as-is.

  Args:
    example: The text to score.

  Returns:
    A dict with the keys ``roberta_neg``, ``roberta_neu`` and
    ``roberta_pos``, holding the softmax of the model's output values at
    positions 0, 1 and 2 respectively.
  """
  encoded_text = tokenizer(example, return_tensors='pt')
  # Inference only: skip autograd tracking for the forward pass so no
  # computation graph is built or kept alive.
  with torch.no_grad():
    output = model(**encoded_text)
  # output[0] holds the scores for the batch; [0] selects the single input.
  scores = softmax(output[0][0].numpy())
  scores_dict = {
      'roberta_neg' : scores[0],
      'roberta_neu' : scores[1],
      'roberta_pos' : scores[2]
  }
  return scores_dict

In [None]:
# Score every review with both VADER and RoBERTa.
# Start from an empty dict: `res` was already filled with VADER-only scores by
# the earlier loop, so any review that fails below would otherwise keep that
# stale entry (with differently named keys) and leak into the combined table.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
  # Read the fields before the try block so `myid` is always bound when the
  # failure message is printed.
  text = row['Text']
  myid = row['Id']
  try:
    vader_result = sia.polarity_scores(text)
    # Prefix the VADER keys so they are distinguishable from the roberta_* keys.
    vader_result_rename = {
        f"vader_{key}": value for key, value in vader_result.items()
    }
    roberta_result = polarity_scores_roberta(text)
    both = {**vader_result_rename, **roberta_result}
    res[myid] = both
  except RuntimeError:
    # Best effort: a review the model cannot process is reported and skipped,
    # leaving no entry for it in `res`.
    print(f'Broke for id {myid}')

In [None]:
# Build the combined results table: one row per scored review (.T flips the
# frame so the Ids become rows), the Id exposed as a regular column, and the
# original review columns attached with a left merge.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

## Compare Scores between models


In [None]:
# Pairwise scatter plots of all VADER and RoBERTa scores, coloured by the
# review's star rating.
sns.pairplot(
    data=results_df,
    vars=[
        'vader_neg', 'vader_neu', 'vader_pos',
        'roberta_neg', 'roberta_neu', 'roberta_pos',
    ],
    hue='Score',
    palette='tab10',
)
plt.show()

In [None]:
# positive sentiment 1 star review: the 1-star text with the highest roberta_pos score
results_df.query('Score == 1').sort_values('roberta_pos', ascending=False)['Text'].values[0]

In [None]:
# positive sentiment 1 star review: the 1-star text with the highest vader_pos score
results_df.query('Score == 1').sort_values('vader_pos', ascending=False)['Text'].values[0]

In [None]:
# negative sentiment 5 star review: the 5-star text with the highest roberta_neg score
results_df.query('Score == 5').sort_values('roberta_neg', ascending=False)['Text'].values[0]

In [None]:
# negative sentiment 5 star review: the 5-star text with the highest vader_neg score
results_df.query('Score == 5').sort_values('vader_neg', ascending=False)['Text'].values[0]

# Transformers Pipeline


In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so the results do not change if the library's default changes.
# NOTE(review): this id is believed to be the current default for
# "sentiment-analysis" — confirm against the installed transformers version.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Try the pipeline on a clearly positive sentence.
sent_pipeline('I am so happy!')

In [None]:
# Try the pipeline on a clearly negative sentence.
sent_pipeline("I hate this so much!")