In [1]:
# Classification tasks with different types of features
>>> import nltk

In [2]:
# classify part of speech based on sentence context
>>> from nltk.corpus import brown

In [3]:
# define features for the "i"th word in the sentence, including three types of suffix 
#     and one pre-word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1
def pos_features(sentence, i):
    """Return the feature dict used to classify the POS of ``sentence[i]``.

    sentence -- sequence of untagged word strings
    i        -- index of the word to describe

    The dict holds the last one, two and three characters of the word
    ("suffix(1)" .. "suffix(3)") and the preceding word ("prev-word"),
    which is the marker "<START>" when the word opens the sentence.
    """
    word = sentence[i]
    features = {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
    }
    # the first word has no predecessor, so use the start-of-sentence marker
    features["prev-word"] = "<START>" if i == 0 else sentence[i - 1]
    return features

In [4]:
# look at features of a specific word in a specific sentence
# take the first sentence of the Brown corpus (a list of word strings)
sentence0 = brown.sents()[0]
print(sentence0)
# the word at index 8 of sentence 0 (displayed as the cell result)
sentence0[8]

[u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.']


u'investigation'

In [5]:
# POS features of the word at index 8 of sentence0; its "prev-word" is the word at index 7
pos_features(sentence0, 8)

{'prev-word': u'an',
 'suffix(1)': u'n',
 'suffix(2)': u'on',
 'suffix(3)': u'ion'}

In [6]:
# get the POS-tagged sentences of the Brown corpus restricted to the 'news' category
tagged_sents = brown.tagged_sents(categories='news')
# keep the first tagged sentence (a list of (word, tag) pairs) for the examples below
tag_sent0 = tagged_sents[0]
print(tag_sent0)

[(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN'), (u"Atlanta's", u'NP$'), (u'recent', u'JJ'), (u'primary', u'NN'), (u'election', u'NN'), (u'produced', u'VBD'), (u'``', u'``'), (u'no', u'AT'), (u'evidence', u'NN'), (u"''", u"''"), (u'that', u'CS'), (u'any', u'DTI'), (u'irregularities', u'NNS'), (u'took', u'VBD'), (u'place', u'NN'), (u'.', u'.')]


In [7]:
# the function nltk.tag.untag takes the tags off a tagged sentence,
#   leaving just the list of words
print(nltk.tag.untag(tag_sent0))

[u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.']


In [8]:
# the python enumerate function generates an index number for each item in a list;
#   here each item is a (word, tag) pair, unpacked directly in the loop header
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)

(0, u'The', u'AT')
(1, u'Fulton', u'NP-TL')
(2, u'County', u'NN-TL')
(3, u'Grand', u'JJ-TL')
(4, u'Jury', u'NN-TL')
(5, u'said', u'VBD')
(6, u'Friday', u'NR')
(7, u'an', u'AT')
(8, u'investigation', u'NN')
(9, u'of', u'IN')
(10, u"Atlanta's", u'NP$')
(11, u'recent', u'JJ')
(12, u'primary', u'NN')
(13, u'election', u'NN')
(14, u'produced', u'VBD')
(15, u'``', u'``')
(16, u'no', u'AT')
(17, u'evidence', u'NN')
(18, u"''", u"''")
(19, u'that', u'CS')
(20, u'any', u'DTI')
(21, u'irregularities', u'NNS')
(22, u'took', u'VBD')
(23, u'place', u'NN')
(24, u'.', u'.')


In [9]:
# Build one (features, tag) pair per word of the tagged corpus.
# Features are computed from the untagged sentence (pos_features only sees words),
#   while the gold tag comes from the matching position in the tagged sentence.
# enumerate supplies the word index that pos_features needs.
featuresets = []
for tagged_sent in tagged_sents:
    words = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(words, position), tag)
        for position, (_word, tag) in enumerate(tagged_sent)
    )

In [10]:
# look at the feature sets of the first 10 words;
#   each entry is a (feature dict, tag) pair
for f in featuresets[:10]:
	print (f)

({'suffix(3)': u'The', 'prev-word': '<START>', 'suffix(2)': u'he', 'suffix(1)': u'e'}, u'AT')
({'suffix(3)': u'ton', 'prev-word': u'The', 'suffix(2)': u'on', 'suffix(1)': u'n'}, u'NP-TL')
({'suffix(3)': u'nty', 'prev-word': u'Fulton', 'suffix(2)': u'ty', 'suffix(1)': u'y'}, u'NN-TL')
({'suffix(3)': u'and', 'prev-word': u'County', 'suffix(2)': u'nd', 'suffix(1)': u'd'}, u'JJ-TL')
({'suffix(3)': u'ury', 'prev-word': u'Grand', 'suffix(2)': u'ry', 'suffix(1)': u'y'}, u'NN-TL')
({'suffix(3)': u'aid', 'prev-word': u'Jury', 'suffix(2)': u'id', 'suffix(1)': u'd'}, u'VBD')
({'suffix(3)': u'day', 'prev-word': u'said', 'suffix(2)': u'ay', 'suffix(1)': u'y'}, u'NR')
({'suffix(3)': u'an', 'prev-word': u'Friday', 'suffix(2)': u'an', 'suffix(1)': u'n'}, u'AT')
({'suffix(3)': u'ion', 'prev-word': u'an', 'suffix(2)': u'on', 'suffix(1)': u'n'}, u'NN')
({'suffix(3)': u'of', 'prev-word': u'investigation', 'suffix(2)': u'of', 'suffix(1)': u'f'}, u'IN')


In [11]:
# using naive Bayes as the classifier
# split data into a training set and a test set, using a 90%/10% split:
#   the first 10% of featuresets becomes the test set, the rest the training set
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
# show how many examples ended up in each set
print(len(train_set))
print(len(test_set))

90499
10055


In [12]:
# train a naive Bayes classifier on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy on the held-out test set (this will take a little while)
nltk.classify.accuracy(classifier, test_set)
# the result is reasonable for POS features without the previous tag

0.7891596220785678

In [13]:
### sentence segmentation
# treebank_raw provides the text already divided into sentences of tokens
sents = nltk.corpus.treebank_raw.sents()
# a bare `len(sents)` in the middle of a cell is evaluated and then discarded,
#   so print it to actually show the number of sentences
print(len(sents))
# show the first 7 sentences
for sent in sents[:7]:
    print (sent)

[u'.', u'START']
[u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.']
[u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.', u'V', u'.,', u'the', u'Dutch', u'publishing', u'group', u'.']
[u'.', u'START']
[u'Rudolph', u'Agnew', u',', u'55', u'years', u'old', u'and', u'former', u'chairman', u'of', u'Consolidated', u'Gold', u'Fields', u'PLC', u',', u'was', u'named', u'a', u'nonexecutive', u'director', u'of', u'this', u'British', u'industrial', u'conglomerate', u'.']
[u'.', u'START']
[u'A', u'form', u'of', u'asbestos', u'once', u'used', u'to', u'make', u'Kent', u'cigarette', u'filters', u'has', u'caused', u'a', u'high', u'percentage', u'of', u'cancer', u'deaths', u'among', u'a', u'group', u'of', u'workers', u'exposed', u'to', u'it', u'more', u'than', u'30', u'years', u'ago', u',', u'researchers', u'reported', u'.']


In [14]:
# Flatten the treebank_raw sentences into one token list and remember where
#   each sentence ends:
#   tokens     -- every token of every sentence, in order
#   boundaries -- set of indexes (into tokens) of the last token of each sentence
#   offset     -- number of tokens collected so far
tokens, boundaries, offset = [], set(), 0
for sentence in nltk.corpus.treebank_raw.sents():
    tokens += sentence
    offset = len(tokens)
    # the sentence just added ends at the last index currently in tokens
    boundaries.add(offset - 1)

In [15]:
# look at tokens and boundaries
preview = tokens[:40]
print(preview)
print(len(boundaries))
# membership checks for a few hand-picked token indexes
for idx in (0, 1, 19, 20):
    print(idx in boundaries)
# list index, token and whether that index is a sentence boundary
for num, tok in enumerate(preview):
     print (num, tok, '\t', num in boundaries)

[u'.', u'START', u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.', u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.', u'V', u'.,', u'the', u'Dutch', u'publishing', u'group', u'.', u'.', u'START', u'Rudolph']
4193
False
True
False
True
(0, u'.', '\t', False)
(1, u'START', '\t', True)
(2, u'Pierre', '\t', False)
(3, u'Vinken', '\t', False)
(4, u',', '\t', False)
(5, u'61', '\t', False)
(6, u'years', '\t', False)
(7, u'old', '\t', False)
(8, u',', '\t', False)
(9, u'will', '\t', False)
(10, u'join', '\t', False)
(11, u'the', '\t', False)
(12, u'board', '\t', False)
(13, u'as', '\t', False)
(14, u'a', '\t', False)
(15, u'nonexecutive', '\t', False)
(16, u'director', '\t', False)
(17, u'Nov', '\t', False)
(18, u'.', '\t', False)
(19, u'29', '\t', False)
(20, u'.', '\t', True)
(21, u'Mr', '\t', False)
(22, u'.', '\t', False)
(23, u'Vinken', '\t', False)


In [16]:
# feature extraction function
# token is a list of words and we get the features of the token at offset i
def punct_features(tokens, i):
    """Return the feature dict describing the punctuation token ``tokens[i]``.

    tokens -- list of token strings
    i      -- index of the candidate punctuation token

    Features:
      'next-word-capitalized' -- True when the following token starts with an
                                 upper-case character; False when there is no
                                 following token (or it is empty)
      'prevword'              -- the preceding token, lower-cased, or the
                                 marker '<START>' when i is 0
      'punct'                 -- the candidate token itself
      'prev-word-is-one-char' -- True when the preceding token has length 1
    """
    # At index 0 there is no previous token; tokens[-1] would silently wrap
    # around to the END of the list, so use an explicit marker instead.
    has_prev = i != 0
    prev_token = tokens[i - 1] if has_prev else None
    # The candidate may be the very last token; indexing i+1 would raise
    # IndexError. Slicing with [:1] also tolerates an empty next token.
    next_token = tokens[i + 1] if i + 1 < len(tokens) else ''
    return {'next-word-capitalized': next_token[:1].isupper(),
        'prevword': prev_token.lower() if has_prev else '<START>',
        'punct': tokens[i],
        'prev-word-is-one-char': has_prev and len(prev_token) == 1}

In [17]:
# feature dictionary for the period at index 20
# first confirm which token sits at index 20
print(tokens[20])
# then show its features (displayed as the cell result)
punct_features(tokens,20)

.


{'next-word-capitalized': True,
 'prev-word-is-one-char': False,
 'prevword': u'29',
 'punct': u'.'}

In [18]:
# Define featuresets of all candidate punctuation, in two steps:
#  1. collect the indexes of candidate tokens; the first and last token are
#     excluded so punct_features always has a neighbour on both sides
#  2. pair each candidate's features with its label: True when the index is a
#     recorded sentence boundary
candidate_positions = [i for i in range(1, len(tokens) - 1) if tokens[i] in '.?!']
Sfeaturesets = [(punct_features(tokens, i), (i in boundaries))
                for i in candidate_positions]

In [19]:
# look at the feature sets of the first 10 punctuation symbols;
#   each entry is a (feature dict, is-boundary label) pair
for sf in Sfeaturesets[:10]:
	print (sf)

({'next-word-capitalized': False, 'punct': u'.', 'prev-word-is-one-char': False, 'prevword': u'nov'}, False)
({'next-word-capitalized': True, 'punct': u'.', 'prev-word-is-one-char': False, 'prevword': u'29'}, True)
({'next-word-capitalized': True, 'punct': u'.', 'prev-word-is-one-char': False, 'prevword': u'mr'}, False)
({'next-word-capitalized': True, 'punct': u'.', 'prev-word-is-one-char': True, 'prevword': u'n'}, False)
({'next-word-capitalized': False, 'punct': u'.', 'prev-word-is-one-char': False, 'prevword': u'group'}, True)
({'next-word-capitalized': True, 'punct': u'.', 'prev-word-is-one-char': True, 'prevword': u'.'}, False)
({'next-word-capitalized': False, 'punct': u'.', 'prev-word-is-one-char': False, 'prevword': u'conglomerate'}, True)
({'next-word-capitalized': True, 'punct': u'.', 'prev-word-is-one-char': True, 'prevword': u'.'}, False)
({'next-word-capitalized': True, 'punct': u'.', 'prev-word-is-one-char': False, 'prevword': u'reported'}, True)
({'next-word-capitalized

In [20]:
# separate into training and test sets with a 90/10 split:
#   size is the number of examples (10%) reserved for testing
# NOTE: this rebinds the name `size` that the POS split above also used
size = int(len(Sfeaturesets) * 0.1)
size

594

In [21]:
# the first `size` examples form the test set, the remainder the training set
Strain_set, Stest_set = Sfeaturesets[size:], Sfeaturesets[:size]
# train a naive Bayes classifier to decide whether a punctuation token ends a sentence
Sclassifier = nltk.NaiveBayesClassifier.train(Strain_set)
# accuracy on the held-out test set (displayed as the cell result)
nltk.classify.accuracy(Sclassifier, Stest_set)

0.936026936026936

In [22]:
# define function to use the trained classifier to label sentences
def segment_sentences(words):
      """Split a flat list of tokens into sentences using the trained Sclassifier.

      words -- list of token strings

      Returns a list of sentences, each a slice (list) of ``words``. A sentence
      ends after every '.', '?' or '!' candidate that the classifier labels as
      a boundary; any tokens left after the last detected boundary form the
      final sentence.
      """
      start = 0
      sents = []
      # The final token is deliberately not classified: punct_features needs a
      # following token (classifying the last one used to raise IndexError), and
      # the result is the same either way because everything from `start` to the
      # end is appended as the last sentence below.
      for i in range(len(words) - 1):
          word = words[i]
          # the classifier's labels are the booleans produced by `i in boundaries`
          if word in '.?!' and Sclassifier.classify(punct_features(words, i)):
              sents.append(words[start:i+1])
              start = i+1
      if start < len(words):
          sents.append(words[start:])
      return sents

In [23]:
# try it out on a subset of the tokens from the treebank
# total number of tokens and a peek at the first 50
print(len(tokens))
print(tokens[:50])

# segment only the first 1000 tokens and print each predicted sentence
tinytokens = tokens[:1000]
for s in segment_sentences(tinytokens):
    print (s)

101797
[u'.', u'START', u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.', u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.', u'V', u'.,', u'the', u'Dutch', u'publishing', u'group', u'.', u'.', u'START', u'Rudolph', u'Agnew', u',', u'55', u'years', u'old', u'and', u'former', u'chairman', u'of', u'Consolidated']
[u'.']
[u'START', u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.', u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.', u'V', u'.,', u'the', u'Dutch', u'publishing', u'group', u'.']
[u'.', u'START', u'Rudolph', u'Agnew', u',', u'55', u'years', u'old', u'and', u'former', u'chairman', u'of', u'Consolidated', u'Gold', u'Fields', u'PLC', u',', u'was', u'named', u'a', u'nonexecutive', u'director', u'of', u'this

In [24]:
# compare to the NLTK default sentence tokenizer, which works on raw text instead of tokens
from nltk.tokenize import sent_tokenize

In [25]:
# this sentence segmenter starts with raw text, instead of tokens
# (the sample text is split over two literals only to keep the lines short;
#  note the two spaces between the sentences)
rawtext = ('Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.  '
           'Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.')
# sent_tokenize was imported from nltk.tokenize in the previous cell
sents = sent_tokenize(rawtext)
sents

['Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.',
 'Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.']

In [26]:
## classify documents based on keywords
from nltk.corpus import movie_reviews
import random

In [27]:
# movie reviews are labeled either positive or negative (by human annotators);
#   list the category labels the corpus defines
movie_reviews.categories()

[u'neg', u'pos']

In [28]:
# for each document in movie_reviews, get its words and category (positive/negative)
#   documents is a list of (word list, category) pairs, grouped by category
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
# number of documents collected (displayed as the cell result)
len(documents)

2000

In [29]:
# shuffle the documents in place; they were collected category by category,
#   so shuffling mixes the categories before the data is split
# NOTE(review): no random seed is set in the visible cells, so the order (and
#   every downstream result) presumably differs from run to run
random.shuffle(documents)
# look at the first document - consists of a list of all the words in the review
# followed by the category
print(documents[0])

([u'sometimes', u'a', u'stellar', u'cast', u'can', u'compensate', u'for', u'a', u'lot', u'of', u'things', u',', u'and', u'"', u'pushing', u'tin', u'"', u'certainly', u'features', u'some', u'name', u'stars', u'who', u'are', u'going', u'places', u':', u'billy', u'bob', u'thornton', u',', u'cate', u'blanchett', u',', u'angelina', u'jolie', u',', u'and', u'oh', u'yes', u'john', u'cusack', u'who', u'might', u'not', u'realize', u'it', u'at', u'first', u',', u'but', u'he', u"'", u's', u'actually', u'the', u'*', u'veteran', u'*', u'among', u'this', u'quartet', u'of', u'fine', u'-', u'looking', u'people', u'.', u'sometimes', u'a', u'terrific', u'cast', u'like', u'this', u'can', u'compensate', u'for', u'a', u'lackluster', u'screen', u'treatment', u'of', u'an', u'idea', u'that', u'has', u'"', u'hip', u'comedy', u'"', u'written', u'all', u'over', u'it', u',', u'compensate', u'for', u'workmanlike', u'but', u'uninspired', u'direction', u',', u'compensate', u'for', u'an', u'obnoxious', u'score', u'th

In [30]:
## use words from all documents to define the word vector for features
# get all words from all movie_reviews, lower-cased, and put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# number of distinct (lower-cased) words (displayed as the cell result)
len(all_words)

39768

In [78]:
# number of most frequent corpus words kept as candidate keyword features
#   (the old comment said 2000, but the code has been selecting 5005)
NUM_WORD_FEATURES = 5005

# get the NUM_WORD_FEATURES most frequently appearing keywords in the corpus
word_items = all_words.most_common(NUM_WORD_FEATURES)
word_features = [word for (word, _freq) in word_items]   # just the words

# look at the first 100 words
print(word_features[:100])

[u',', u'the', u'.', u'a', u'and', u'of', u'to', u"'", u'is', u'in', u's', u'"', u'it', u'that', u'-', u')', u'(', u'as', u'with', u'for', u'his', u'this', u'film', u'i', u'he', u'but', u'on', u'are', u't', u'by', u'be', u'one', u'movie', u'an', u'who', u'not', u'you', u'from', u'at', u'was', u'have', u'they', u'has', u'her', u'all', u'?', u'there', u'like', u'so', u'out', u'about', u'up', u'more', u'what', u'when', u'which', u'or', u'she', u'their', u':', u'some', u'just', u'can', u'if', u'we', u'him', u'into', u'even', u'only', u'than', u'no', u'time', u'good', u'most', u'its', u'will', u'story', u'would', u'been', u'much', u'character', u'also', u'get', u'other', u'do', u'two', u'well', u'them', u'very', u'characters', u';', u'first', u'--', u'after', u'see', u'!', u'way', u'because', u'make', u'life']


In [79]:
# define features (keywords) of a document
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Map each keyword to whether it occurs in the document.

    document      -- iterable of the document's words
    word_features -- iterable of keywords to test for

    Returns a dict with one 'contains(keyword)' entry per keyword, whose value
    is True when that keyword appears in the document and False otherwise.
    """
    # a set makes each membership test cheap regardless of document length
    present = set(document)
    return {'contains(%s)' % keyword: (keyword in present)
            for keyword in word_features}

In [80]:
# get feature sets for every document: a (keyword feature dict, category) pair
# NOTE: this rebinds `featuresets`, which earlier held the POS feature sets
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

# each feature dict has one entry per keyword in word_features (thousands of
#   entries), so displaying one is optional
featuresets[0]

({u'contains(waste)': False,
  u'contains(menacing)': False,
  u'contains(bag)': False,
  u'contains(happily)': False,
  u'contains(lot)': True,
  u'contains(twisted)': False,
  u'contains(slater)': False,
  u'contains(*)': True,
  u'contains(commercials)': False,
  u'contains(debate)': False,
  u'contains(black)': False,
  u'contains(sand)': False,
  u'contains(rated)': False,
  u'contains(conventions)': False,
  u'contains(copy)': False,
  u'contains(mcconaughey)': False,
  u'contains(potential)': False,
  u'contains(response)': False,
  u'contains(frequency)': False,
  u'contains(m)': False,
  u'contains(reaches)': False,
  u'contains(verhoeven)': False,
  u'contains(understand)': False,
  u'contains(cost)': False,
  u'contains(disguise)': False,
  u'contains(everywhere)': False,
  u'contains(drug)': False,
  u'contains(1991)': False,
  u'contains(enters)': False,
  u'contains(campaign)': False,
  u'contains(case)': False,
  u'contains(reed)': False,
  u'contains(floating)': False,


In [81]:
# training using a naive Bayes classifier with a 90/10 split
# hold out the first 10% of the documents for testing; the size is computed
#   rather than hard-coded (it is 200 for the 2000 reviews), matching how the
#   earlier splits are sized
test_size = len(featuresets) // 10
train_set, test_set = featuresets[test_size:], featuresets[:test_size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier, test_set))
# the accuracy result may vary since we randomized the documents

0.835


In [61]:
# show which features of classifier are most informative (top 30)
# NOTE(review): this cell's execution count (61) is lower than that of the
#   training cell above (81), so the listing below may come from an earlier
#   classifier - re-run after training to refresh it
classifier.show_most_informative_features(30)

Most Informative Features
         contains(worst) = True              neg : pos    =      4.7 : 1.0
        contains(stupid) = True              neg : pos    =      3.7 : 1.0
     contains(excellent) = True              pos : neg    =      3.4 : 1.0
        contains(boring) = True              neg : pos    =      3.4 : 1.0
     contains(perfectly) = True              pos : neg    =      2.7 : 1.0
     contains(effective) = True              pos : neg    =      2.6 : 1.0
     contains(brilliant) = True              pos : neg    =      2.5 : 1.0
      contains(supposed) = True              neg : pos    =      2.4 : 1.0
     contains(wonderful) = True              pos : neg    =      2.4 : 1.0
         contains(solid) = True              pos : neg    =      2.3 : 1.0
          contains(wars) = True              pos : neg    =      2.3 : 1.0
            contains(it) = False             neg : pos    =      2.3 : 1.0
       contains(perfect) = True              pos : neg    =      2.2 : 1.0