# POS Tagging Classifier

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] +=1
    suffix_fdist[word[-2:]] +=1
    suffix_fdist[word[-3:]] +=1

In [2]:
common_suffixes = []
for suffix in suffix_fdist.most_common(100):
    common_suffixes.append(str(suffix.__getitem__(0)))
print (common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [3]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix]=word.lower().endswith(suffix)
    return features

In [4]:
pos_features('lovely')
pos_features('expansion')

{'endswith(e)': False,
 'endswith(,)': False,
 'endswith(.)': False,
 'endswith(s)': False,
 'endswith(d)': False,
 'endswith(t)': False,
 'endswith(he)': False,
 'endswith(n)': True,
 'endswith(a)': False,
 'endswith(of)': False,
 'endswith(the)': False,
 'endswith(y)': False,
 'endswith(r)': False,
 'endswith(to)': False,
 'endswith(in)': False,
 'endswith(f)': False,
 'endswith(o)': False,
 'endswith(ed)': False,
 'endswith(nd)': False,
 'endswith(is)': False,
 'endswith(on)': True,
 'endswith(l)': False,
 'endswith(g)': False,
 'endswith(and)': False,
 'endswith(ng)': False,
 'endswith(er)': False,
 'endswith(as)': False,
 'endswith(ing)': False,
 'endswith(h)': False,
 'endswith(at)': False,
 'endswith(es)': False,
 'endswith(or)': False,
 'endswith(re)': False,
 'endswith(it)': False,
 'endswith(``)': False,
 'endswith(an)': False,
 "endswith('')": False,
 'endswith(m)': False,
 'endswith(;)': False,
 'endswith(i)': False,
 'endswith(ly)': False,
 'endswith(ion)': True,
 'endswit

In [5]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:], "suffix(2)": sentence[i][-2:], "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [6]:
brown.sents()[0]
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [7]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []

In [8]:
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )

In [9]:
for f in featuresets[:10]:
    print (f)

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [10]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
len(train_set)

90499

In [11]:
len(test_set)

10055

In [12]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

# Sentence Segmentation

In [13]:
sents = nltk.corpus.treebank_raw.sents()
sents[:10]

[['.', 'START'],
 ['Pierre',
  'Vinken',
  ',',
  '61',
  'years',
  'old',
  ',',
  'will',
  'join',
  'the',
  'board',
  'as',
  'a',
  'nonexecutive',
  'director',
  'Nov',
  '.',
  '29',
  '.'],
 ['Mr',
  '.',
  'Vinken',
  'is',
  'chairman',
  'of',
  'Elsevier',
  'N',
  '.',
  'V',
  '.,',
  'the',
  'Dutch',
  'publishing',
  'group',
  '.'],
 ['.', 'START'],
 ['Rudolph',
  'Agnew',
  ',',
  '55',
  'years',
  'old',
  'and',
  'former',
  'chairman',
  'of',
  'Consolidated',
  'Gold',
  'Fields',
  'PLC',
  ',',
  'was',
  'named',
  'a',
  'nonexecutive',
  'director',
  'of',
  'this',
  'British',
  'industrial',
  'conglomerate',
  '.'],
 ['.', 'START'],
 ['A',
  'form',
  'of',
  'asbestos',
  'once',
  'used',
  'to',
  'make',
  'Kent',
  'cigarette',
  'filters',
  'has',
  'caused',
  'a',
  'high',
  'percentage',
  'of',
  'cancer',
  'deaths',
  'among',
  'a',
  'group',
  'of',
  'workers',
  'exposed',
  'to',
  'it',
  'more',
  'than',
  '30',
  'years',


In [14]:
tokens = [ ]
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)
    
tokens[:30]

['.',
 'START',
 'Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov',
 '.',
 '29',
 '.',
 'Mr',
 '.',
 'Vinken',
 'is',
 'chairman',
 'of',
 'Elsevier',
 'N',
 '.']

In [15]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
        'prevword': tokens[i-1].lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [16]:
Sfeaturesets = [(punct_features(tokens, i), (i in boundaries))
for i in range(1, len(tokens) - 1)
    if tokens[i] in '.?!']

In [17]:
size = int(len(Sfeaturesets) * 0.1)
Strain_set, Stest_set = Sfeaturesets[size:], Sfeaturesets[:size]
Sclassifier = nltk.NaiveBayesClassifier.train(Strain_set)
nltk.classify.accuracy(Sclassifier, Stest_set)

0.936026936026936

In [18]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and Sclassifier.classify(punct_features(words, i)) == True:
                sents.append(words[start:i+1])
                start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [19]:
len(tokens)

101797

In [20]:
smalltokens = tokens[:1000]

In [21]:
for s in segment_sentences(smalltokens):
    print (s)

['.']
['START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.']
['.', 'START', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', '.', 'START', 'A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.']
['The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even', 'b

# Text Classification (aka Text Categorization)

In [22]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
import random
movie_reviews.categories()

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


['neg', 'pos']

In [23]:
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]

In [24]:
random.shuffle(documents)

In [25]:
documents[0]

(['as',
  'bad',
  'as',
  '"',
  'mimic',
  '"',
  'was',
  ',',
  'it',
  'definitly',
  'scared',
  'me',
  '.',
  '"',
  'mimic',
  '"',
  'continued',
  'the',
  'frightening',
  'hollywood',
  'trend',
  'of',
  'taking',
  'a',
  'foreign',
  'director',
  'who',
  'shows',
  'a',
  'lot',
  'of',
  'creative',
  'ingenuity',
  'and',
  'style',
  ',',
  'and',
  'completely',
  'flushing',
  'it',
  'when',
  'he',
  'comes',
  'to',
  'america',
  'to',
  'make',
  'a',
  'movie',
  '.',
  'director',
  'guillermo',
  'del',
  'toro',
  'was',
  'recently',
  'imported',
  'from',
  'mexico',
  'after',
  'he',
  'made',
  'the',
  'award',
  'winning',
  'and',
  'inventive',
  'horror',
  'film',
  '"',
  'cronos',
  '.',
  '"',
  'that',
  'movie',
  'worked',
  'because',
  'it',
  'took',
  'an',
  'unconventional',
  'story',
  ',',
  'unique',
  'characters',
  'and',
  'well',
  'written',
  'plot',
  'and',
  'dialogue',
  'to',
  'create',
  'a',
  'truly',
  'scary'

In [26]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]

list(word_features)[:100]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 'but',
 'his',
 'girlfriend',
 'continues',
 'see',
 'him',
 'in',
 'her',
 'life',
 'has',
 'nightmares',
 'what',
 "'",
 's',
 'deal',
 '?',
 'watch',
 'movie',
 '"',
 'sorta',
 'find',
 'out',
 'critique',
 'mind',
 '-',
 'fuck',
 'for',
 'generation',
 'that',
 'touches',
 'on',
 'very',
 'cool',
 'idea',
 'presents',
 'it',
 'bad',
 'package',
 'which',
 'is',
 'makes',
 'this',
 'review',
 'even',
 'harder',
 'write',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'with',
 'your',
 'head',
 'such',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 'there',
 'are',
 'good',
 'ways',
 'making',
 'all',
 'types',
 'these',
 'folks']

In [27]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [28]:
featuresets = [(document_features(d), c) for (d,c) in documents]
featuresets[0]

({'contains(plot)': True,
  'contains(:)': True,
  'contains(two)': True,
  'contains(teen)': False,
  'contains(couples)': False,
  'contains(go)': False,
  'contains(to)': True,
  'contains(a)': True,
  'contains(church)': False,
  'contains(party)': False,
  'contains(,)': True,
  'contains(drink)': False,
  'contains(and)': True,
  'contains(then)': True,
  'contains(drive)': False,
  'contains(.)': True,
  'contains(they)': True,
  'contains(get)': True,
  'contains(into)': False,
  'contains(an)': True,
  'contains(accident)': False,
  'contains(one)': True,
  'contains(of)': True,
  'contains(the)': True,
  'contains(guys)': False,
  'contains(dies)': False,
  'contains(but)': True,
  'contains(his)': True,
  'contains(girlfriend)': False,
  'contains(continues)': False,
  'contains(see)': True,
  'contains(him)': False,
  'contains(in)': True,
  'contains(her)': False,
  'contains(life)': False,
  'contains(has)': True,
  'contains(nightmares)': False,
  'contains(what)': True,

In [29]:
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, test_set))

0.83


In [30]:
classifier.show_most_informative_features(20)

Most Informative Features
           contains(ugh) = True              neg : pos    =      9.0 : 1.0
    contains(schumacher) = True              neg : pos    =      7.4 : 1.0
        contains(shoddy) = True              neg : pos    =      7.0 : 1.0
 contains(unimaginative) = True              neg : pos    =      7.0 : 1.0
        contains(suvari) = True              neg : pos    =      7.0 : 1.0
     contains(atrocious) = True              neg : pos    =      7.0 : 1.0
          contains(mena) = True              neg : pos    =      7.0 : 1.0
        contains(turkey) = True              neg : pos    =      6.3 : 1.0
       contains(jumbled) = True              neg : pos    =      6.3 : 1.0
        contains(wasted) = True              neg : pos    =      5.9 : 1.0
        contains(justin) = True              neg : pos    =      5.8 : 1.0
       contains(singers) = True              pos : neg    =      5.7 : 1.0
       contains(bronson) = True              neg : pos    =      5.6 : 1.0