# Binary Sentiment Analysis

## Imports

In [34]:
import nltk
import random
from nltk.corpus import movie_reviews

## Getting the Data

In [17]:
reviews = [(list(movie_reviews.words(fileid)), category) 
           for category in movie_reviews.categories() 
           for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)

In [19]:
words = [w.lower() for w in movie_reviews.words()]

In [21]:
frqs = nltk.FreqDist(words)
frqs.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

## Making Data for Training
### Taking the Top 3000
`.keys()` just returns the words. We will take the top 3000 of them

In [38]:
word_features = list(frqs.keys())[:3000]

### Map words
We map words to a boolean classifier that says if the word is a part of the top 3000 occouring words or not

In [61]:
def find_features(reviews):
    words = set(reviews)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [66]:
find_features(movie_reviews.words('neg/cv004_12641.txt'))

{'plot': True,
 ':': True,
 'two': True,
 'teen': False,
 'couples': False,
 'go': False,
 'to': True,
 'a': True,
 'church': False,
 'party': False,
 ',': True,
 'drink': False,
 'and': True,
 'then': True,
 'drive': False,
 '.': True,
 'they': True,
 'get': False,
 'into': False,
 'an': True,
 'accident': True,
 'one': True,
 'of': True,
 'the': True,
 'guys': False,
 'dies': False,
 'but': True,
 'his': True,
 'girlfriend': False,
 'continues': False,
 'see': True,
 'him': False,
 'in': True,
 'her': True,
 'life': False,
 'has': True,
 'nightmares': False,
 'what': True,
 "'": True,
 's': True,
 'deal': False,
 '?': True,
 'watch': False,
 'movie': True,
 '"': True,
 'sorta': False,
 'find': False,
 'out': True,
 'critique': False,
 'mind': False,
 '-': True,
 'fuck': False,
 'for': True,
 'generation': False,
 'that': True,
 'touches': False,
 'on': True,
 'very': False,
 'cool': False,
 'idea': False,
 'presents': False,
 'it': True,
 'bad': True,
 'package': False,
 'which': Tru

Do it for all reviews

In [63]:
featuresets = [(find_features(rev), catergory) for (rev, catergory) in reviews]

### Splitting into Train and Test
We take the 8--20 split

In [80]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

## The Classifier
Naive Bayes just identifies and associated the words with classes - in our case, positive and negative

In [81]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [82]:
nltk.classify.accuracy(classifier, training_set)

0.8978947368421053

In [85]:
nltk.classify.accuracy(classifier, testing_set)

0.77

### Result
We got a 77% accuracy on the test set. Pretty good for a classifier as simple as this one

So what were the words that decided the classifier?

In [84]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =      9.8 : 1.0
                  stinks = True              neg : pos    =      9.7 : 1.0
                  annual = True              pos : neg    =      9.6 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                 idiotic = True              neg : pos    =      7.5 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
              schumacher = True              neg : pos    =      6.6 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
             silverstone = True              neg : pos    =      6.3 : 1.0
                 kidding = True              neg : pos    =      6.3 : 1.0

## Saving the Model 
We will pickle it for later uses

In [86]:
import pickle
save = open('assets/models/4-nativebayes.pickle', 'wb')
pickle.dump(classifier, save)
save.close()

Let's load it and see if it worked...

In [88]:
saved = open('assets/models/4-nativebayes.pickle', 'rb')
classifier = pickle.load(saved)
nltk.classify.accuracy(classifier, testing_set)

0.77