In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import nltk

# Suppress warnings that may occur in the code

In [2]:
import warnings
warnings.simplefilter("ignore", UserWarning)
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Replace infinity values with NaN before operations
df.replace([np.inf, -np.inf], np.nan, inplace=True)

NameError: name 'df' is not defined

In [None]:
# Read in the data and keep only a small sample so the notebook runs quickly.
SAMPLE_SIZE = 300  # number of reviews kept for the rest of the analysis

df = pd.read_csv('../input/reviews/Reviews.csv')
print(df.shape)

# Keep the first SAMPLE_SIZE rows and normalise +/-inf to NaN. The clean-up has
# to happen here, after the frame exists; it returns a new frame rather than
# mutating in place so that re-running the cell is idempotent.
df = df.head(SAMPLE_SIZE).replace([np.inf, -np.inf], np.nan)
print(df.shape)

In [None]:
# Peek at the text of the first review in the sample.
df['Text'].iloc[0]

In [None]:
# Bar chart of how many reviews in the sample have each star rating.
rating_counts = df['Score'].value_counts().sort_index()
ax = rating_counts.plot(
    kind='bar',
    title='Count of reviews by rating',  # fixes the "count be ratings" typo
    figsize=(10, 5),
)
ax.set_xlabel('ratings')
ax.set_ylabel('number of reviews')  # the y axis was previously unlabeled
plt.show()

In [None]:
# Basic NLTK walkthrough: use the review with index label 43 as the running example.
example = df.loc[43, 'Text']
print(example)

In [None]:
# Tokenize the example review with NLTK and preview the first ten tokens.
tokens = nltk.word_tokenize(example)
tokens[:10]

In [None]:
# Part-of-speech tag the tokens and preview the first ten tagged entries.
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
# Run NLTK's named-entity chunker over the tagged tokens and pretty-print the result.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

## VADER EVALUATIONS


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance, reused by every VADER scoring cell below.
sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity-check the analyzer on a sentence that mentions both polarities.
mixed_sentence = 'this is a positive as well as a negative review'
sia.polarity_scores(mixed_sentence)

In [None]:
# Polarity scores for the running example review.
sia.polarity_scores(example)

In [None]:
# Score every review in the sample, keyed by its Id.
res = {}
id_text_pairs = zip(df['Id'], df['Text'])
for review_id, review_text in tqdm(id_text_pairs, total=len(df)):
    res[review_id] = sia.polarity_scores(review_text)

In [None]:
# One row per review: the score dict becomes columns, the dict key becomes 'Id',
# and the original review columns are attached with a left merge.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

In [None]:
# Now we have both the sentiment scores and the review metadata in one frame.
vaders.head()

In [None]:
# Bar plot of the compound score for each star rating.
ax = sns.barplot(x='Score', y='compound', data=vaders)
ax.set(title='Compound Score by ratings')
plt.show()

In [None]:
# One panel per score component (pos / neu / neg), each plotted against star rating.
fig, axs = plt.subplots(1, 3, figsize=(11, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (component, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=component, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.show()

# Roberta Pretrained Model
* Use a model trained on a large corpus of data
* A transformer model accounts for the words and also for their context in relation to other words

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Identifier of the pretrained sentiment checkpoint (plain string; no formatting needed).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER results on the example review, shown again for comparison with RoBERTa.
print(example)
sia.polarity_scores(example)

In [None]:
# Run the RoBERTa model on the example review: tokenize, take the first
# sequence's logits, and turn them into probabilities with softmax.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# Each index needs its own key. Previously index 2 reused 'roberta_neg', which
# silently overwrote the index-0 score and produced no 'roberta_pos' entry.
# Key order matches polarity_scores_roberta: 0 -> neg, 1 -> neu, 2 -> pos.
scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2],
}
scores_dict

In [None]:
def polarity_scores_roberta(example):
    """Score one text with the pretrained RoBERTa sentiment model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``softmax`` names.

    Parameters
    ----------
    example : str
        Text to classify. It is passed to the tokenizer as-is (no truncation
        is requested), so any exception the model raises for an input it
        cannot handle propagates to the caller.

    Returns
    -------
    dict
        Keys ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'`` mapped
        to the softmax of logits 0, 1 and 2 respectively.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    encoded_text = tokenizer(example, return_tensors='pt')
    # Inference only: disable autograd so no gradient graph is built per call.
    with torch.no_grad():
        output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    logits = output[0][0].detach().numpy()
    probabilities = softmax(logits)
    return {
        'roberta_neg': probabilities[0],
        'roberta_neu': probabilities[1],
        'roberta_pos': probabilities[2],
    }

In [None]:
# Score every review with both VADER and RoBERTa and collect the results by Id.
res = {}
for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text, myId = row['Text'], row['Id']
        # Prefix the VADER keys so they cannot collide with the RoBERTa keys.
        vader_scores = {
            f"vader_{name}": score
            for name, score in sia.polarity_scores(text).items()
        }
        res[myId] = {**vader_scores, **polarity_scores_roberta(text)}
    except RuntimeError:
        # A RuntimeError while scoring a review is reported and that review is skipped.
        print(f'Broke for id {myId}')
    

In [None]:
# One row per scored review: score dicts become columns, the dict key becomes
# 'Id', and the original review columns are attached with a left merge.
result_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

# Compare Scores between models

In [None]:
# List the columns of the combined frame (model scores plus the merged review columns).
result_df.columns

In [None]:
# Pairwise plots of every VADER / RoBERTa score column, coloured by star rating.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(result_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()

# Examples where the sentiment is positive but the rating is low

In [None]:
# Text of the 1-star review with the highest RoBERTa positive score.
one_star = result_df.query('Score == 1')
one_star.sort_values('roberta_pos', ascending=False)['Text'].iloc[0]

In [None]:
# Text of the 1-star review with the highest VADER positive score.
one_star = result_df.query('Score == 1')
one_star.sort_values('vader_pos', ascending=False)['Text'].iloc[0]

# Examples where the sentiment is negative but the rating is high

In [None]:
# Text of the 5-star review with the highest RoBERTa negative score.
five_star = result_df.query('Score == 5')
five_star.sort_values('roberta_neg', ascending=False)['Text'].iloc[0]

In [None]:
# Text of the 5-star review with the highest VADER negative score.
five_star = result_df.query('Score == 5')
five_star.sort_values('vader_neg', ascending=False)['Text'].iloc[0]