In [2]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import movie_reviews
import random

#### 1. Start out by exploring the NLTK movie reviews corpus (described in chapter 6, section 1.3) and familiarizing yourself with it, which the book didn't do. You know how to do this! Figuring out how big the corpus is, how many reviews there are, and how many of them are positive/negative would be a bare minimum.

In [3]:
# Read in the movie dataset as (word_list, category) pairs, one per review file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# Seed the RNG before shuffling so the document order -- and therefore any
# train/test split and accuracy derived from it -- is identical on every
# Restart & Run All instead of changing from run to run.
random.seed(42)
random.shuffle(documents)

In [19]:
# `documents` holds one (words, category) entry per review file, so its length is the review count
print("There are {} reviews.".format(len(documents)))

There are 2000 reviews.


In [13]:
# The corpus labels each review with one of two categories: 'neg' (negative) or 'pos' (positive)
movie_reviews.categories()

['neg', 'pos']

In [17]:
# Count the review files in each category; the second placeholder must use 'neg'
# (it previously repeated 'pos', which only looked right because the corpus is balanced).
print("There are {} positive and {} negative reviews".format(len(movie_reviews.fileids('pos')), len(movie_reviews.fileids('neg'))))

There are 1000 positive and 1000 negative reviews


In [20]:
# Frequency distribution over every token in the whole corpus, lower-cased before counting
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [21]:
# Vocabulary for the feature extractor: the 2,000 most frequent tokens, most common first.
# most_common() yields (token, count) pairs; only the token is kept. No stop-word or
# punctuation filtering is applied here.
word_features = [entry[0] for entry in all_words.most_common(2000)]

In [22]:
# Feature extractor: for each vocabulary word, record whether it is present in a given document

def document_features(document, vocabulary=None):
    """Build presence/absence word features for one document.

    Parameters
    ----------
    document : iterable of str
        The tokens of the document.
    vocabulary : iterable of str, optional
        The words to test for. When omitted, the notebook-level
        ``word_features`` list is used, which is the original
        one-argument behaviour.

    Returns
    -------
    dict
        Maps ``'contains(<word>)'`` to ``True`` or ``False`` for every word
        in the vocabulary, in vocabulary order.
    """
    if vocabulary is None:
        # Fall back to the global vocabulary built earlier in the notebook.
        vocabulary = word_features
    # Convert once to a set so each membership test is a hash lookup
    # rather than a scan of the token list.
    document_words = set(document)
    return {'contains({})'.format(word): (word in document_words)
            for word in vocabulary}

In [23]:
# Convert every (tokens, label) pair into (feature_dict, label), keeping document order.
featuresets = [(document_features(tokens), label) for tokens, label in documents]
# Hold out the first 100 reviews for testing and train on everything after them.
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [24]:
# Accuracy of the trained classifier on the held-out test set
print(nltk.classify.accuracy(classifier, test_set))

0.75


In [25]:
# Show the 5 features that most strongly separate the two labels (printed as pos:neg / neg:pos ratios)
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.9 : 1.0
         contains(mulan) = True              pos : neg    =      8.4 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
          contains(lame) = True              neg : pos    =      6.5 : 1.0


In [26]:
# Same ranking, widened to the top 10 features
classifier.show_most_informative_features(10)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.9 : 1.0
         contains(mulan) = True              pos : neg    =      8.4 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
          contains(lame) = True              neg : pos    =      6.5 : 1.0
         contains(damon) = True              pos : neg    =      6.4 : 1.0
         contains(awful) = True              neg : pos    =      6.2 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.7 : 1.0
         contains(waste) = True              neg : pos    =      5.7 : 1.0


In [27]:
# Same ranking, widened to the top 20 features
classifier.show_most_informative_features(20)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.9 : 1.0
         contains(mulan) = True              pos : neg    =      8.4 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
          contains(lame) = True              neg : pos    =      6.5 : 1.0
         contains(damon) = True              pos : neg    =      6.4 : 1.0
         contains(awful) = True              neg : pos    =      6.2 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.7 : 1.0
         contains(waste) = True              neg : pos    =      5.7 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.3 : 1.0
          contains(jedi) = True              pos : neg    =      5.1 : 1.0
           contains(era) = True              pos : neg    =      5.0 : 1.0

In [29]:
# Our own fake movie review, written to be packed with praise words
myreview = """Mr. Matt Damon was outstanding, fantastic, excellent, wonderfully subtle, superb, terrific, and memorable in his portrayal of Mulan."""  

In [30]:
# Tokenize myreview; lower-case first because the feature vocabulary was built from lower-cased words
myreview_toks = nltk.word_tokenize(myreview.lower())

In [31]:
# Inspect the tokenized review (punctuation marks come out as separate tokens)
myreview_toks

['mr.',
 'matt',
 'damon',
 'was',
 'outstanding',
 ',',
 'fantastic',
 ',',
 'excellent',
 ',',
 'wonderfully',
 'subtle',
 ',',
 'superb',
 ',',
 'terrific',
 ',',
 'and',
 'memorable',
 'in',
 'his',
 'portrayal',
 'of',
 'mulan',
 '.']

In [32]:
# Map the review's tokens onto the same presence/absence features the classifier was trained on
myreview_feats = document_features(myreview_toks) 

In [41]:
# Peek at the first 10 features; entries are inserted in word_features order
list(myreview_feats.items())[:10]

[('contains(,)', True),
 ('contains(the)', False),
 ('contains(.)', True),
 ('contains(a)', False),
 ('contains(and)', True),
 ('contains(of)', True),
 ('contains(to)', False),
 ("contains(')", False),
 ('contains(is)', False),
 ('contains(in)', True)]

In [42]:
# Predicted label for the first fake review
classifier.classify(myreview_feats)

'pos'

In [43]:
# Probability the classifier assigns to the 'pos' label for the first review
classifier.prob_classify(myreview_feats).prob('pos')

0.7256751236987142

In [44]:
# Probability the classifier assigns to the 'neg' label for the first review
classifier.prob_classify(myreview_feats).prob('neg')

0.2743248763012792

In [45]:
# Second fake review: identical to the first except the actor is Steven Seagal instead of Matt Damon
myreview2 = \
"""Mr. Steven Seagal was outstanding, fantastic, excellent, wonderfully subtle, superb, terrific, and memorable in his portrayal of Mulan."""  

In [46]:
# Lower-case and tokenize the second review
myreview2_toks = nltk.word_tokenize(myreview2.lower())

In [47]:
# Presence/absence features for the second review
myreview2_feats = document_features(myreview2_toks)

In [48]:
# Predicted label for the second review
classifier.classify(myreview2_feats)

'neg'

In [49]:
# Probability the classifier assigns to the 'pos' label for the second review
classifier.prob_classify(myreview2_feats).prob('pos')

0.019333353569168855

In [50]:
# Probability the classifier assigns to the 'neg' label for the second review
classifier.prob_classify(myreview2_feats).prob('neg')

0.9806666464307993

In [51]:
# Third fake review: a much shorter version of the first one
myreview3 = "Mr. Matt Damon was outstanding, fantastic."   

In [52]:
# Lower-case and tokenize the third review
myreview3_toks = nltk.word_tokenize(myreview3.lower())

In [53]:
# Presence/absence features for the third review
myreview3_feats = document_features(myreview3_toks)

In [54]:
# Predicted label for the third review
classifier.classify(myreview3_feats)

'neg'

In [55]:
# Probability the classifier assigns to the 'pos' label for the third review
classifier.prob_classify(myreview3_feats).prob('pos')

1.0285817257496308e-06

In [56]:
# Probability the classifier assigns to the 'neg' label for the third review
classifier.prob_classify(myreview3_feats).prob('neg')

0.9999989714182644

This surprising result comes from the fact that, under this particular classifier model, every review, long or short, is represented by exactly the same set of 2,000 presence/absence word features. Even though this short review has only 8 tokens, there are (at least) 1,992 other features simultaneously voting for the 'pos' or 'neg' label. In this case, these "absent" word features voted heavily towards 'neg' (e.g., *enjoyed* was not found, which pushes the prediction towards 'neg'); the presence of *Matt*, *Damon*, *outstanding*, and *fantastic* -- all strong features towards 'pos' -- didn't have enough collective sway to outweigh them.