# Text Classification: Positive or Negative Movie Reviews

In [1]:
### classify documents based on keywords
import nltk
from nltk.corpus import movie_reviews
import random
# make sure the movie_reviews corpus is available locally
nltk.download('movie_reviews')

# movie reviews are labeled either positive or negative (by human annotators);
# list the category labels used by the corpus
print(movie_reviews.categories())

['neg', 'pos']


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [2]:
# build one (token list, label) pair per review file, walking the corpus
# category by category
documents = [
    (list(movie_reviews.words(file_id)), label)
    for label in movie_reviews.categories()
    for file_id in movie_reviews.fileids(label)
]
print(len(documents))

2000


In [3]:
# fix the RNG seed so the shuffle -- and therefore the train/test split and the
# reported accuracy -- is reproducible on Restart & Run All; change the seed to
# see how the results vary with a different split
random.seed(42)
random.shuffle(documents)
# look at the first document - consists of a list of all the words in the review
# followed by the category
print(documents[0])

(['this', 'is', 'the', 'best', 'british', 'gangster', 'film', 'since', 'the', 'long', 'good', 'friday', '.', 'jon', 'bennet', '(', 'played', 'by', 'andrew', 'howard', ')', 'is', 'an', 'extremely', 'good', 'assassin', '.', 'as', 'such', 'he', 'is', 'probably', 'an', 'evil', 'man', 'but', 'it', 'does', 'not', 'worry', 'him', '.', 'he', 'has', 'become', 'an', 'unquestioning', 'weapon', '.', 'he', 'is', 'in', 'the', 'employ', 'of', 'a', 'kingpin', '(', 'david', 'calder', ')', 'far', 'more', 'evil', 'than', 'he', 'is', '.', 'but', 'even', 'assassins', 'have', 'innocent', 'pasts', '.', 'he', 'runs', 'into', 'an', 'old', 'from', 'school', 'days', 'and', 'he', 'is', 'married', 'to', 'a', 'mutual', 'girl', 'friend', 'from', 'school', '.', 'complicating', 'matters', 'is', 'that', 'they', 'live', 'near', 'where', 'he', 'had', 'a', 'recent', 'job', 'and', 'their', 'young', 'daughter', 'may', 'have', 'seen', 'him', 'at', 'the', 'crime', '.', 'a', 'big', 'piece', 'of', 'what', 'makes', 'this', 'film

In [4]:
## build the vocabulary that the feature vectors are drawn from
# count every lower-cased token across all movie reviews
all_words = nltk.FreqDist(map(str.lower, movie_reviews.words()))
print(len(all_words))

39768


In [5]:
# keep the 2000 most frequent tokens of the corpus as candidate keywords
word_items = all_words.most_common(2000)
# most_common yields (word, count) pairs; only the word is needed
word_features = [pair[0] for pair in word_items]

# preview the first 100 keywords
print(word_features[:100])

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']


In [6]:
def document_features(document, word_features):
    """Build the keyword-presence features of one document.

    document      -- iterable of the tokens that make up the document
    word_features -- iterable of keywords to test for

    Returns a dict with one entry per keyword, keyed 'V_<keyword>', whose
    value is True when the keyword occurs in the document and False otherwise.
    """
    # a set makes each membership test constant-time
    present = set(document)
    return {'V_%s' % keyword: keyword in present for keyword in word_features}

# pair each document's keyword features with its category label
featuresets = [
    (document_features(tokens, word_features), label)
    for tokens, label in documents
]

# optional: each feature dict has one entry per keyword (2000 here), so this prints a lot
print(featuresets[0])

({'V_,': True, 'V_the': True, 'V_.': True, 'V_a': True, 'V_and': True, 'V_of': True, 'V_to': True, "V_'": True, 'V_is': True, 'V_in': True, 'V_s': True, 'V_"': True, 'V_it': True, 'V_that': True, 'V_-': True, 'V_)': True, 'V_(': True, 'V_as': True, 'V_with': True, 'V_for': False, 'V_his': True, 'V_this': True, 'V_film': True, 'V_i': True, 'V_he': True, 'V_but': True, 'V_on': True, 'V_are': False, 'V_t': True, 'V_by': True, 'V_be': False, 'V_one': True, 'V_movie': False, 'V_an': True, 'V_who': True, 'V_not': True, 'V_you': False, 'V_from': True, 'V_at': True, 'V_was': True, 'V_have': True, 'V_they': True, 'V_has': True, 'V_her': False, 'V_all': True, 'V_?': False, 'V_there': False, 'V_like': True, 'V_so': False, 'V_out': True, 'V_about': False, 'V_up': False, 'V_more': True, 'V_what': True, 'V_when': False, 'V_which': False, 'V_or': False, 'V_she': False, 'V_their': True, 'V_:': False, 'V_some': False, 'V_just': True, 'V_can': False, 'V_if': False, 'V_we': True, 'V_him': True, 'V_into':

In [7]:
# train a naive Bayes classifier with a 95/5 split:
# the first 100 shuffled documents are held out for testing, the rest are used for training
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier on the held-out documents
print(nltk.classify.accuracy(classifier, test_set))
# the exact figure depends on how the documents were shuffled

0.88


In [8]:
# show which features of the classifier are most informative
# (show_most_informative_features prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None" after the table)
classifier.show_most_informative_features(30)

Most Informative Features
           V_outstanding = True              pos : neg    =     11.0 : 1.0
                 V_mulan = True              pos : neg    =      9.0 : 1.0
                V_seagal = True              neg : pos    =      8.2 : 1.0
           V_wonderfully = True              pos : neg    =      7.5 : 1.0
                 V_damon = True              pos : neg    =      7.0 : 1.0
                V_wasted = True              neg : pos    =      6.1 : 1.0
                 V_flynt = True              pos : neg    =      5.7 : 1.0
                  V_lame = True              neg : pos    =      5.6 : 1.0
                 V_awful = True              neg : pos    =      5.4 : 1.0
                V_poorly = True              neg : pos    =      5.2 : 1.0
            V_ridiculous = True              neg : pos    =      5.0 : 1.0
                 V_waste = True              neg : pos    =      5.0 : 1.0
                   V_era = True              pos : neg    =      4.5 : 1.0

# Rerun with fewer keyword features (500 instead of 2000)

In [9]:
# keep only the 500 most frequent tokens of the corpus as candidate keywords
word_items = all_words.most_common(500)
# most_common yields (word, count) pairs; only the word is needed
word_features = [pair[0] for pair in word_items]

# preview the first 100 keywords
print(word_features[:100])

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']


In [10]:
# NOTE(review): this repeats the document_features definition from the earlier
# cell unchanged; it is kept here so this cell can still be run on its own
def document_features(document, word_features):
    """Map each keyword to whether it appears in the document.

    document      -- iterable of the document's tokens
    word_features -- iterable of keywords to look for

    Returns a dict keyed 'V_<keyword>' with a True/False value per keyword.
    """
    tokens_seen = set(document)
    return dict(('V_%s' % kw, kw in tokens_seen) for kw in word_features)

# pair each document's keyword features with its category label
featuresets = [
    (document_features(tokens, word_features), label)
    for tokens, label in documents
]

# optional: each feature dict has one entry per keyword (500 in this run), so this prints a lot
print(featuresets[0])

({'V_,': True, 'V_the': True, 'V_.': True, 'V_a': True, 'V_and': True, 'V_of': True, 'V_to': True, "V_'": True, 'V_is': True, 'V_in': True, 'V_s': True, 'V_"': True, 'V_it': True, 'V_that': True, 'V_-': True, 'V_)': True, 'V_(': True, 'V_as': True, 'V_with': True, 'V_for': False, 'V_his': True, 'V_this': True, 'V_film': True, 'V_i': True, 'V_he': True, 'V_but': True, 'V_on': True, 'V_are': False, 'V_t': True, 'V_by': True, 'V_be': False, 'V_one': True, 'V_movie': False, 'V_an': True, 'V_who': True, 'V_not': True, 'V_you': False, 'V_from': True, 'V_at': True, 'V_was': True, 'V_have': True, 'V_they': True, 'V_has': True, 'V_her': False, 'V_all': True, 'V_?': False, 'V_there': False, 'V_like': True, 'V_so': False, 'V_out': True, 'V_about': False, 'V_up': False, 'V_more': True, 'V_what': True, 'V_when': False, 'V_which': False, 'V_or': False, 'V_she': False, 'V_their': True, 'V_:': False, 'V_some': False, 'V_just': True, 'V_can': False, 'V_if': False, 'V_we': True, 'V_him': True, 'V_into':

In [11]:
# train a naive Bayes classifier with a 95/5 split:
# the first 100 shuffled documents are held out for testing, the rest are used for training
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier on the held-out documents
print(nltk.classify.accuracy(classifier, test_set))
# the exact figure depends on how the documents were shuffled

0.72


In [12]:
# show which features of the classifier are most informative
# (show_most_informative_features prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None" after the table)
classifier.show_most_informative_features(30)

Most Informative Features
                 V_worst = True              neg : pos    =      4.5 : 1.0
                    V_it = False             neg : pos    =      2.6 : 1.0
              V_supposed = True              neg : pos    =      2.4 : 1.0
               V_perfect = True              pos : neg    =      2.3 : 1.0
                V_others = True              pos : neg    =      2.1 : 1.0
                   V_war = True              pos : neg    =      2.0 : 1.0
                 V_maybe = True              neg : pos    =      2.0 : 1.0
                   V_bad = True              neg : pos    =      2.0 : 1.0
                     V_= = True              neg : pos    =      1.9 : 1.0
                  V_true = True              pos : neg    =      1.9 : 1.0
          V_performances = True              pos : neg    =      1.9 : 1.0
              V_american = True              pos : neg    =      1.8 : 1.0
             V_different = True              pos : neg    =      1.7 : 1.0