### Importing Libraries

In [1]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
#nltk.downloader.download('vader_lexicon')
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from scipy.special import softmax
import pandas as pd
from tqdm.notebook import tqdm

### Setting up the two approaches 

#### VADER

In [2]:
sia = SentimentIntensityAnalyzer()

def do_analysis_vader(text):
    return sia.polarity_scores(text)

#### Encoder Only Transformer Model

In [None]:
MODEL_NAME = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

def do_analysis_transformer(text):
    tokens = tokenizer(text, return_tensors='pt')
    output = model(**tokens)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return scores

### Method to Compare Approaches

In [4]:
def compare_approaches(text):
    vader_results = do_analysis_vader(text)
    transformer_result = do_analysis_transformer(text)

    results = {"Category" : ["Negative", "Neutral", "Positive"],
               "VADER" : [vader_results["neg"], vader_results["neu"], vader_results["pos"]],
               "Transformer" : [transformer_result[0], transformer_result[1], transformer_result[2]]               
               }
    results_df = pd.DataFrame(results)

    print("Text : ", text)
    print("\nResults\n", results_df)

### Comparison

#### Comparison using regular text messages

In [5]:
compare_approaches("I fell in the mud today")

Text :  I fell in the mud today

Results
    Category  VADER  Transformer
0  Negative    0.0     0.780168
1   Neutral    1.0     0.199560
2  Positive    0.0     0.020272


In [6]:
compare_approaches("I hate my life")

Text :  I hate my life

Results
    Category  VADER  Transformer
0  Negative  0.649     0.854381
1   Neutral  0.351     0.114677
2  Positive  0.000     0.030941


In [7]:
compare_approaches("Oh Great! I lost")

Text :  Oh Great! I lost

Results
    Category  VADER  Transformer
0  Negative  0.299     0.844328
1   Neutral  0.130     0.118721
2  Positive  0.571     0.036952


In [8]:
compare_approaches("You're really not that funny")

Text :  You're really not that funny

Results
    Category  VADER  Transformer
0  Negative    0.4     0.889141
1   Neutral    0.6     0.096598
2  Positive    0.0     0.014261


In [9]:
compare_approaches(":)")

Text :  :)

Results
    Category  VADER  Transformer
0  Negative    0.0     0.011871
1   Neutral    0.0     0.046397
2  Positive    1.0     0.941732


In [10]:
compare_approaches(":(")

Text :  :(

Results
    Category  VADER  Transformer
0  Negative    1.0     0.654244
1   Neutral    0.0     0.266964
2  Positive    0.0     0.078792


In [11]:
compare_approaches(":D")

Text :  :D

Results
    Category  VADER  Transformer
0  Negative    0.0     0.010884
1   Neutral    0.0     0.051635
2  Positive    1.0     0.937480


In [12]:
compare_approaches("I hate myself :)")

Text :  I hate myself :)

Results
    Category  VADER  Transformer
0  Negative  0.481     0.891915
1   Neutral  0.130     0.087239
2  Positive  0.390     0.020846


#### Comparison using data set of reviews

In [19]:
# Importing the amazon reviews from csv file
review_data = pd.read_csv("../data/AmazonReviews.csv")
review_data.head()

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,1
1,This case takes so long to ship and it's not e...,1
2,Good for not droids. Not good for iPhones. You...,1
3,The cable was not compatible between my macboo...,1
4,The case is nice but did not have a glow light...,1


In [20]:
# Creating a new table to store results in
analysis_results = pd.DataFrame(columns=["Review", "Rating", "Vader_Neg", "Vader_Neu", "Vader_Pos", "Transformer_Neg", "Transformer_Neu", "Transformer_Pos"])

In [21]:
# Performing sentiment analysis on all the reviews using both the approaches and adding results to the new table

for i, row in tqdm(review_data.iterrows(), total=len(review_data)):
    try:
        review = row["Review"]
        rating = row["Sentiment"]

        vader_results = do_analysis_vader(review)
        transformer_result = do_analysis_transformer(review)

        new_row = [review, rating, vader_results["neg"], vader_results["neu"], vader_results["pos"], transformer_result[0], transformer_result[1], transformer_result[2]]
        analysis_results.loc[len(analysis_results)] = new_row
    except:
        continue

  0%|          | 0/25000 [00:00<?, ?it/s]

In [22]:
analysis_results.head()

Unnamed: 0,Review,Rating,Vader_Neg,Vader_Neu,Vader_Pos,Transformer_Neg,Transformer_Neu,Transformer_Pos
0,Fast shipping but this product is very cheaply...,1,0.0,0.832,0.168,0.437809,0.343967,0.218224
1,This case takes so long to ship and it's not e...,1,0.168,0.832,0.0,0.950509,0.04346,0.006031
2,Good for not droids. Not good for iPhones. You...,1,0.0,0.845,0.155,0.594396,0.346952,0.058652
3,The cable was not compatible between my macboo...,1,0.0,1.0,0.0,0.883488,0.105966,0.010546
4,The case is nice but did not have a glow light...,1,0.18,0.738,0.082,0.874157,0.109985,0.015858


In [23]:
analysis_results.to_csv("../data/AnalysisResults.csv")

In [24]:
len(analysis_results)

24651

Few iterations fail because length of review causes issues