In [None]:
# In this notebook we will do some sentiment analysis in Python using two different techniques:

# VADER (Valence Aware Dictionary and sEntiment Reasoner) - bag-of-words approach
# -> VADER: scores each word individually; it does not analyze sequential meaning
# RoBERTa pretrained model from 🤗
# -> RoBERTa: analyzes sequential meaning
# Hugging Face pipeline

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [None]:
# Read in the reviews data (Reviews.csv is expected in the working directory)
df = pd.read_csv('Reviews.csv')
df.shape  # (rows, columns) of the full file

: 

In [None]:
df = df.head(2000)  # the dataset is very large, so keep only the first 2000 rows
df.shape

: 

In [None]:
df.head()

: 

### EDA


In [None]:
df['Score'].value_counts().sort_index()

: 

In [None]:
# Number of reviews per star rating, ordered by star value.
star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot(kind='bar', title='Review count', figsize=(10, 5))
ax.set_xlabel('Review Star')
plt.show()

: 

In [None]:
# Pick the review text stored under index label 50 as the running example.
example = df.loc[50, 'Text']
print(example)

: 

In [None]:
# Tokenize the example review and keep only the first 8 tokens.
# NOTE(review): nltk.word_tokenize presumably needs NLTK's Punkt tokenizer data
# to be available locally; download it once if this raises a LookupError.
tokens=nltk.word_tokenize(example)[:8]
tokens

: 

In [None]:
# nltk.download('averaged_perceptron_tagger')

: 

In [None]:
# Part-of-speech tag the tokens and show the first 4 tagged entries.
# The tagger data is the resource named in the commented-out download cell above.
tagged=nltk.pos_tag(tokens)
tagged[:4]

: 

In [None]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

: 

In [None]:
# Chunk the POS-tagged tokens into named entities and pretty-print the result.
entities=nltk.chunk.ne_chunk(tagged)
entities.pprint()

: 

### Approach 1: VADER

In [None]:
# Approach 1: VADER (Valence Aware Dictionary and sEntiment Reasoner) - bag-of-words approach

: 

In [None]:
# nltk.download('vader_lexicon')

: 

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# VADER analyzer reused by every scoring cell below; presumably relies on the
# 'vader_lexicon' resource named in the commented-out download cell above.
sia=SentimentIntensityAnalyzer()

: 

In [None]:
sia.polarity_scores('that is good')

: 

In [None]:
# Same phrase with a negation added, to compare against the scores of 'that is good'.
sia.polarity_scores('that is not good')

: 

In [None]:
sia.polarity_scores(example)

: 

In [None]:
# Polarity scores for the whole dataset.
# df.iterrows() yields one (index, row) pair per DataFrame row; tqdm wraps the
# iterator to show a progress bar, and total= tells it how many rows to expect.
res={}
for i,row in tqdm(df.iterrows(),total=len(df)):
    text=row['Text']
    myid=row['Id']
    res[myid]=sia.polarity_scores(text)  # scores keyed by review Id

: 

In [None]:
# The outer dict keys (review Ids) become the columns here, which is why the
# next cell transposes the frame.
pd.DataFrame(res)

: 

In [None]:
# Transpose so that each row is one review Id and each column one VADER score.
vaders=pd.DataFrame(res).T
vaders

: 

In [None]:
# Move the Id index into a regular column named 'Id' so it can serve as the merge key.
# NOTE(review): this cell rebuilds `vaders` from itself, so it is not safe to run
# twice in a row without re-running the cell that created `vaders`.
vaders=vaders.reset_index().rename(columns={'index':'Id'})
vaders

: 

In [None]:
# Left-join the review metadata onto the sentiment scores. No on= is given, so
# pandas joins on the columns both frames share (expected to be just 'Id'; verify).
vaders=vaders.merge(df,how='left')

: 

In [None]:
#Now we have sentiment score and metadata
vaders.head()

: 

In [None]:
# VADER compound score per star rating.
ax = sns.barplot(
    data=vaders,
    x='Score',
    y='compound',
)
ax.set(title='compound score by amazon star review')
plt.show()

: 

In [None]:
# VADER positive score per star rating.
ax = sns.barplot(
    data=vaders,
    x='Score',
    y='pos',
)
ax.set(title='positive score by amazon star review')
plt.show()

: 

In [None]:
# One panel per VADER component, each plotted against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = (
    ('pos', 'Positive'),
    ('neu', 'Neutral'),
    ('neg', 'Negative'),
)
for panel_ax, (score_column, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=score_column, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

: 

### Approach 2: RoBERTa

In [None]:
# Approach 2: RoBERTa pretrained model
# Uses a model trained on a large corpus of data.
# A transformer model accounts for the words but also for their context relative to other words.

: 

In [None]:
# !pip install torch torchvision
!pip install AutoModelForSequenceClassification'

: 

In [None]:
from transformers import AutoTokenizer#tokenize like nltk
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

: 

In [None]:
# Checkpoint identifier shared by the tokenizer and the classification model.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

: 

In [None]:
# VADER results on example
print(example)
sia.polarity_scores(example)

: 

In [None]:
# Run the RoBERTa model on the single example review
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
# First output element, first sequence of the batch -> raw class scores,
# converted to probabilities with softmax.
scores = softmax(output[0][0].detach().numpy())
label_names = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = {name: scores[position] for position, name in enumerate(label_names)}
print(scores_dict)

: 

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the module-level RoBERTa sentiment model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        example: Text to classify.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated instead of being sent through unchanged,
            which previously surfaced as a RuntimeError on long reviews.
            NOTE(review): 512 is assumed to be the limit of the loaded
            checkpoint; confirm if MODEL is changed.

    Returns:
        dict with the keys 'roberta_neg', 'roberta_neu' and 'roberta_pos'
        holding the softmax-normalized scores at positions 0, 1 and 2 of the
        model output.
    """
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # First output element, first (only) sequence in the batch.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

: 

In [None]:
# Dry run: compute both models' scores for the first row only, so the two
# result dicts can be inspected in the next cell. `res` stays empty here.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['Text']
    myid = row['Id']
    vader_result = sia.polarity_scores(text)
    roberta_result = polarity_scores_roberta(text)
    break  # stop after the first row

: 

In [None]:
{**vader_result, **roberta_result}

: 

In [None]:
# Dry run on the first row again, this time prefixing the VADER keys with
# 'vader_' so they are labelled like the 'roberta_' keys.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['Text']
    myid = row['Id']
    vader_result = sia.polarity_scores(text)
    vader_result_rename = {}
    for key, value in vader_result.items():
        vader_result_rename[f"vader_{key}"] = value
    roberta_result = polarity_scores_roberta(text)
    break  # stop after the first row

: 

In [None]:
{**vader_result_rename, **roberta_result}

: 

In [None]:
# Score every review with both models and collect the merged results by Id.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text, myid = row['Text'], row['Id']
        vader_result = sia.polarity_scores(text)
        # Prefix the VADER keys so they stay distinguishable from the RoBERTa ones.
        vader_result_rename = {
            f"vader_{key}": value for key, value in vader_result.items()
        }
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both
    except RuntimeError:
        # Rows that raise a RuntimeError are reported and left out of `res`.
        print(f'Broke for id {myid}')

: 

In [None]:
# One row per review Id: combined model scores joined with the review metadata.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

: 

In [None]:
results_df.head()

: 

In [None]:
# Compare the scores of the two models, colored by star rating
sns.pairplot(data=results_df,
            vars=['vader_neg','vader_neu','vader_pos',
                  'roberta_neg','roberta_neu','roberta_pos'],
            hue='Score',
            palette='tab10')
plt.show()

: 

In [None]:
# Look at some examples where the model scoring and the review score differ the most

: 

In [None]:
# The actual score is 1 star, yet the text looks positive to the model: after the
# descending sort, values[0] is the 1-star review with the highest RoBERTa positive score.
results_df.query('Score==1').sort_values('roberta_pos',ascending=False)['Text'].values[0]

: 

In [None]:
# 1-star review with the highest VADER positive score.
results_df.query('Score==1').sort_values('vader_pos',ascending=False)['Text'].values[0]

: 

In [None]:
# 5-star review with the highest RoBERTa negative score.
results_df.query('Score==5').sort_values('roberta_neg',ascending=False)['Text'].values[0]

: 

In [None]:
# 5-star review with the highest VADER negative score.
results_df.query('Score==5').sort_values('vader_neg',ascending=False)['Text'].values[0]

: 

In [None]:
from transformers import pipeline
# With only a task name, pipeline() downloads the default model and tokenizer
# for that task, so inference takes just two lines of code.
sent_pipeline=pipeline("sentiment-analysis")
# A different model and tokenizer can be used instead of the defaults.

: 

In [None]:
sent_pipeline('thats nice')

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 