# Sentiment classification with VADER

## Load the IMDB dataset
Only the test data is loaded: VADER is a rule-based, lexicon-driven model, so it does not require any training data.

In [9]:
# Import `get_file` from the public `keras.utils` namespace rather than the
# internal `keras.utils.data_utils` module, which newer Keras releases no
# longer expose.
from keras.utils import get_file

# Download the IMDB test split (or reuse the cached copy) and keep its local path.
# NOTE(review): the origin is a goo.gl short link; consider replacing it with
# the resolved, permanent URL so the download does not depend on the shortener.
test_file = get_file('imdb_test.txt', origin='https://goo.gl/mg8bsD', cache_subdir='data')

In [3]:
import csv

# Read the tab-separated test file: column 0 is the review text,
# column 1 is the label, parsed as an integer.
with open(test_file, encoding='utf-8', newline='') as infile:
    rows = [(fields[0], int(fields[1])) for fields in csv.reader(infile, delimiter='\t')]

x_test = [review for review, _ in rows]
y_test = [label for _, label in rows]

# Show the first example together with its label as a sanity check.
x_test[0], y_test[0]

("Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in.",
 0)

In [4]:
# Number of rows read from the test file.
len(x_test)

25001

## Create a VADER classifier

In [5]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer loads the `vader_lexicon` NLTK resource when it is
# constructed and raises LookupError if that resource is not installed.
# Fetch it only when missing so the notebook also runs on a fresh environment.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

vader = SentimentIntensityAnalyzer()

In [6]:
# Baseline probe: score a short, clearly negative phrase.
vader.polarity_scores('a nightmare from hell')

{'compound': -0.6808, 'neg': 0.697, 'neu': 0.303, 'pos': 0.0}

In [8]:
# Same phrase with a leading "not", to see how negation affects the scores.
vader.polarity_scores('not a nightmare from hell')

{'compound': 0.5667, 'neg': 0.0, 'neu': 0.45, 'pos': 0.55}

In [33]:
# Check how an emoticon (a sad face) on its own is scored.
vader.polarity_scores(':(')

{'compound': -0.4404, 'neg': 1.0, 'neu': 0.0, 'pos': 0.0}

## Classification of test data

In [17]:
# Compound polarity score for every review, in the same order as x_test.
scores = [vader.polarity_scores(review)['compound'] for review in x_test]

In [18]:
# Preview the first 20 (compound score, true label) pairs.
list(zip(scores[:20], y_test[:20]))

[(-0.5349, 0),
 (0.6582, 0),
 (-0.897, 0),
 (0.9306, 0),
 (0.9695, 0),
 (0.7879, 0),
 (-0.9694, 0),
 (0.7293, 0),
 (-0.4692, 0),
 (0.9487, 0),
 (0.3496, 0),
 (0.9771, 0),
 (0.417, 0),
 (-0.4915, 0),
 (-0.9821, 0),
 (-0.8345, 0),
 (0.5294, 0),
 (0.2753, 0),
 (0.575, 0),
 (0.8033, 0)]

## Evaluation of accuracy

In [35]:
# A review is predicted positive when its compound score is above zero.
# The boolean prediction is compared with the integer label (True == 1, False == 0).
# `accuracy` holds the number of correct predictions; the displayed value
# is that count divided by the number of scored reviews.
accuracy = sum(
    (compound > 0) == label
    for compound, label in zip(scores, y_test)
)
accuracy/len(scores)

0.6985720571177153