# First Sentiment Analysis

| Author              | Description                                                                                                      |
|---------------------|------------------------------------------------------------------------------------------------------------------|
| Anh Tu Duong Nguyen | Using Sentiment Analysis to filter out suspicious reviews that do not match up with the assigned sentiment class |


In [9]:
import pandas as pd
from transformers import pipeline
import swifter

In [10]:
# Load the review dataset used for the rest of the notebook.
# NOTE(review): the displayed output has an `Unnamed: 0` column, presumably an
# index saved by an earlier `to_csv` call — confirm whether it should be kept.
df = pd.read_csv('../data/df_stem_text.csv')
# Bare expression: render the frame as this cell's output.
df

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score,sentiment
0,271781,bluejacket74,2017-03-17,"750 ml bottl , 2016 vintag , bottl # 304 360. ...",4.00,4.00,4.0,4.25,4.00,4.03,positive
1,125646,GratefulBeerGuy,2017-12-20,0 % 16 oz can . funni stori : final walk door ...,4.75,4.75,4.5,4.50,4.50,4.58,positive
2,150672,KingHoppy,2015-08-14,"pass swift current sk , stop origin joe 's lat...",4.00,4.00,4.5,4.25,4.25,4.28,positive
3,150672,biboergosum,2014-12-26,"355ml can , latest nascent craft breweri swift...",3.75,4.25,4.0,3.75,3.75,3.97,neutral
4,104824,Fcolle2,2014-05-27,batch 209 pour thick dark viscou black color t...,4.25,4.75,4.5,4.75,4.50,4.57,positive
...,...,...,...,...,...,...,...,...,...,...,...
1149504,58482,DIM,2009-09-04,a : hazi light coperi color golden edg . pour ...,4.00,4.00,3.5,3.00,3.50,3.60,neutral
1149505,58482,jdaelhousen,2009-08-28,pour produc solid contribut bodi head . perfec...,4.00,5.00,4.5,4.00,4.00,4.44,positive
1149506,58482,beerthulhu,2009-08-24,a : pour pumpkin pie orang gold hightlight swi...,4.00,3.00,4.0,3.50,3.50,3.61,neutral
1149507,58482,firkinhophead,2009-08-21,"pour hazi , almost cloudi , autumn gold . appa...",3.50,4.00,4.0,3.50,4.00,3.92,neutral


In [11]:
from tqdm import tqdm
# Registers tqdm's pandas integration (adds `progress_apply`).
# NOTE(review): the visible cells only call swifter's `.apply`, so this may be unused here.
tqdm.pandas()
import swifter
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
# Make sure the VADER lexicon is present in the local nltk_data directory.
nltk.download('vader_lexicon')

# Single analyzer instance shared by the scoring helpers defined below.
sia = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Label `text` by the sign of its VADER compound score.

    Returns 'positive' when the compound score is above zero, 'negative'
    when it is below zero, and 'neutral' when it is exactly zero. If none
    of the comparisons hold, the function falls through and returns None.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound > 0:
        return 'positive'
    if compound < 0:
        return 'negative'
    if compound == 0:
        return 'neutral'

def get_vader_sentiment_score(text):
    """Return the VADER 'compound' polarity score computed for `text`."""
    return sia.polarity_scores(text)['compound']

# Score every review exactly once. The original ran VADER twice per row (once
# for the label, once for the score), doubling the cost of the slowest step.
vader_scores = df['text'].swifter.apply(get_vader_sentiment_score)

# Derive the label from the stored score using the same sign rule as
# get_vader_sentiment: > 0 is positive, < 0 is negative, otherwise neutral.
vader_labels = (
    pd.Series('neutral', index=vader_scores.index, dtype=object)
    .mask(vader_scores > 0, 'positive')
    .mask(vader_scores < 0, 'negative')
)

# Assign in the original order so the column layout of the saved CSVs is unchanged.
df['vader_prediction'], df['vader_sentiment_score'] = vader_labels, vader_scores

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/tu2/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Pandas Apply:   0%|          | 0/1149509 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1149509 [00:00<?, ?it/s]

In [12]:
# Split the reviews by whether VADER's label agrees with the assigned
# sentiment class, then write each subset to its own CSV (index omitted).
labels_differ = df['vader_prediction'].ne(df['sentiment'])
labels_agree = df['vader_prediction'].eq(df['sentiment'])

mismatch_df = df.loc[labels_differ]
match_df = df.loc[labels_agree]

mismatch_df.to_csv("../data/mismatch_df.csv", index=False)
match_df.to_csv("../data/match_df.csv", index=False)

In [12]:
# Save the full frame, including the VADER columns added above, without the pandas index.
df.to_csv("../data/df_vader.csv", index=False)

# Results
* A first, simple sentiment analysis using the VADER analyzer from NLTK
* The dataset was split into two subsets: reviews where the VADER prediction matched the assigned sentiment and reviews where it did not

# Next Steps
* Maybe use some more sophisticated method like BERT
* Note: BERT was very slow on local machine -> Maybe someone with more compute can do this ;)
* Analyze the mismatches to see if there are other reasons for the mismatch
