# Sentiment Analysis TextBlob and VADER

In [1]:
from time import time
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
training_df = pd.read_csv('./raw-data/train_tweets.csv')
training_df = training_df[['label', 'tweet']]

In [3]:
# -1 negative sentiment, 0 neutral, 1 positive (no inbetweens)
def checkSentimentAccuracy(sentence_list, actual_list, analyzer):
    results = {
        'pos_match': 0,
        'pos_count': 0,
        'neg_match': 0,
        'neg_count': 0,
        'neu_count': 0
    }
    
    for sentence, actual in zip(sentence_list, actual_list):
        if actual == 0:
            results['pos_count'] += 1
        else:
            results['neg_count'] += 1
            
        score = analyzer(sentence)
        
        if score == 1 and actual == 0:
            results['pos_match'] += 1
        elif score == -1 and actual == 1:
            results['neg_match'] += 1
        elif score == 0:
            results['neu_count'] += 1
            
    return results

def printAccuracyResults(results):
    pos_accuracy = tb_results['pos_match']/tb_results['pos_count']*100
    neg_accuracy = tb_results['neg_match']/tb_results['neg_count']*100
    neutral_percent = tb_results['neu_count']/(tb_results['neg_count'] + tb_results['pos_count'])*100
    print('positive accuracy: %.2f' % pos_accuracy)
    print('negative accuracy: %.2f' % neg_accuracy)
    print('neutral percent: %.2f' % neutral_percent)

textblob_threshold = 0
vader_threshold = 0

def textblobAnalyze(sentence):
    analysis = TextBlob(sentence)
    polarity = analysis.sentiment.polarity
    score = 0
    if polarity >= textblob_threshold:
        score = 1
    elif polarity < 0:
        score = -1
        
    return score

v_analyzer = SentimentIntensityAnalyzer()
def vaderAnalyze(sentence):
    analysis = v_analyzer.polarity_scores(sentence)
    raw_score = analysis['compound']
    score = 0
    if raw_score >= vader_threshold:
        score = 1
    elif raw_score < 0:
        score = -1
        
    return score

test_sentence = 'Today is a horrible day.'

# print(textblobAnalyze(test_sentence))
# print(vaderAnalyze(test_sentence))

tb_start = time()
tb_results = checkSentimentAccuracy(training_df['tweet'], training_df['label'], textblobAnalyze)
tb_diff = time() - tb_start
v_start = time()
v_results = checkSentimentAccuracy(training_df['tweet'], training_df['label'], vaderAnalyze)
v_diff = time() - v_start
print('__ TextBlob Results __')
printAccuracyResults(tb_results)
print('elapsed time: %i' % tb_diff)
print('__ Vader Results __')
printAccuracyResults(v_results)
print('elapsed time: %i' % v_diff)

__ TextBlob Results __
positive accuracy: 86.07
negative accuracy: 25.91
neutral percent: 0.00
elapsed time: 6
__ Vader Results __
positive accuracy: 86.07
negative accuracy: 25.91
neutral percent: 0.00
elapsed time: 5


In [4]:
print(v_analyzer.polarity_scores('Today is a horrible day.'))

{'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'compound': -0.5423}


### Conclusion

Given the results above, I am not sure I can rely on a negative analysis, but at least the positive match accuracy is above 80% and that will have to do. Also when tested with the string 'Today is a horrible day' it does return a negative result.

VADER is slightly faster, so I will use that module as it has aditional analysis data that might be useful. I used 'compound' for the test, but there is also a negative, positive, and neutral score.

It should also be noted that textblob does other useful tasks like tokenization and subjectivity scoring which could also be useful.