In [1]:
import nltk

## Gender Identification

In [2]:
def gender_features(word):
    return {'length': len(word),'first_letter':word[:1],'last_letter':word[-1]}
gender_features('Shrek')

{'length': 5, 'first_letter': 'S', 'last_letter': 'k'}

In [3]:
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\SAYED\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [4]:
#how to shuffle data in text files
from nltk.corpus import names
import random
names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
names

[('Saundra', 'male'),
 ('Galina', 'female'),
 ('Arthur', 'male'),
 ('Rickie', 'male'),
 ('Vance', 'male'),
 ('Rahul', 'male'),
 ('Deb', 'female'),
 ('Jodie', 'male'),
 ('Warner', 'male'),
 ('Piotr', 'male'),
 ('Merola', 'female'),
 ('Pincus', 'male'),
 ('Raynard', 'male'),
 ('Abigail', 'female'),
 ('Marta', 'female'),
 ('Hans-Peter', 'male'),
 ('Kelsy', 'female'),
 ('Theodora', 'female'),
 ('Raymund', 'male'),
 ('Melinda', 'female'),
 ('Shea', 'female'),
 ('Nessie', 'female'),
 ('Adolphe', 'male'),
 ('Shelden', 'male'),
 ('Barbee', 'female'),
 ('Dasya', 'female'),
 ('Hetty', 'female'),
 ('Hewet', 'male'),
 ('Anthony', 'male'),
 ('Zuzana', 'female'),
 ('Irita', 'female'),
 ('Ambrosia', 'female'),
 ('Sibley', 'female'),
 ('Sheba', 'female'),
 ('Lorine', 'female'),
 ('Dara', 'female'),
 ('Cary', 'female'),
 ('Irvine', 'male'),
 ('Helli', 'female'),
 ('Genevieve', 'female'),
 ('Davidde', 'male'),
 ('Tammi', 'female'),
 ('Griselda', 'female'),
 ('Janenna', 'female'),
 ('Mickie', 'male'),
 (

In [5]:
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [6]:
train_set

[({'length': 5, 'first_letter': 'M', 'last_letter': 'a'}, 'female'),
 ({'length': 3, 'first_letter': 'I', 'last_letter': 'a'}, 'female'),
 ({'length': 7, 'first_letter': 'C', 'last_letter': 'e'}, 'female'),
 ({'length': 7, 'first_letter': 'M', 'last_letter': 'a'}, 'female'),
 ({'length': 10, 'first_letter': 'G', 'last_letter': 'a'}, 'female'),
 ({'length': 6, 'first_letter': 'S', 'last_letter': 'y'}, 'male'),
 ({'length': 6, 'first_letter': 'V', 'last_letter': 'y'}, 'male'),
 ({'length': 5, 'first_letter': 'C', 'last_letter': 'l'}, 'female'),
 ({'length': 13, 'first_letter': 'J', 'last_letter': 's'}, 'male'),
 ({'length': 7, 'first_letter': 'H', 'last_letter': 'y'}, 'female'),
 ({'length': 5, 'first_letter': 'T', 'last_letter': 'a'}, 'female'),
 ({'length': 6, 'first_letter': 'D', 'last_letter': 'e'}, 'female'),
 ({'length': 5, 'first_letter': 'D', 'last_letter': 'y'}, 'male'),
 ({'length': 5, 'first_letter': 'G', 'last_letter': 'a'}, 'female'),
 ({'length': 6, 'first_letter': 'P', 'la

In [7]:
print(classifier.classify(gender_features('Bob')))

print(classifier.classify(gender_features('Lily')))

male
female


In [8]:
print(nltk.classify.accuracy(classifier, test_set))

0.744


In [9]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'k'              male : female =     45.5 : 1.0
             last_letter = 'a'            female : male   =     35.6 : 1.0
             last_letter = 'f'              male : female =     16.5 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     11.1 : 1.0


## Choosing Right features

In [10]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [11]:
gender_features2('Shrek')

{'firstletter': 's',
 'lastletter': 'k',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 1,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 1,
 'has(k)': True,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 0,
 'has(n)': False,
 'count(o)': 0,
 'has(o)': False,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 1,
 'has(r)': True,
 'count(s)': 1,
 'has(s)': True,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [12]:
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.76


In [13]:
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

In [14]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [15]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.778


In [16]:
##Using the dev-test set, we can generate a list of the errors that the classifier makes when predicting name genders:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [17]:
for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

correct=female   guess=male     name=Adel                          
correct=female   guess=male     name=Adelind                       
correct=female   guess=male     name=Adriaens                      
correct=female   guess=male     name=Alis                          
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Brett                         
correct=female   guess=male     name=Bridget                       
correct=female   guess=male     name=Clo                           
correct=female   guess=male     name=Dawn                          
correct=female   guess=male     name=Doloritas                     
correct=female   guess=male     name=Dorit                         
correct=female   guess=male     name=Ester      

### adjust our feature extractor to include features for two-letter suffixes:

In [18]:
def gender_features(word):
    return {'suffix1': word[-1:],'suffix2': word[-2:]}

In [19]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.792


### Document Classification

In [20]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\SAYED\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [21]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [22]:
#Example 6-2. A feature extractor for document classification, whose features indicate whether or not individual words are present in a given document.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [23]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(plot)': True, 'contains(:)': True, 'contains(two)': True, 'contains(teen)': False, 'contains(couples)': False, 'contains(go)': False, 'contains(to)': True, 'contains(a)': True, 'contains(church)': False, 'contains(party)': False, 'contains(,)': True, 'contains(drink)': False, 'contains(and)': True, 'contains(then)': True, 'contains(drive)': False, 'contains(.)': True, 'contains(they)': True, 'contains(get)': True, 'contains(into)': True, 'contains(an)': True, 'contains(accident)': False, 'contains(one)': True, 'contains(of)': True, 'contains(the)': True, 'contains(guys)': False, 'contains(dies)': False, 'contains(but)': True, 'contains(his)': True, 'contains(girlfriend)': True, 'contains(continues)': False, 'contains(see)': False, 'contains(him)': True, 'contains(in)': True, 'contains(her)': False, 'contains(life)': False, 'contains(has)': True, 'contains(nightmares)': False, 'contains(what)': True, "contains(')": True, 'contains(s)': True, 'contains(deal)': False, 'contains

In [24]:
## Example 6-3. Training and testing a classifier for document classification.
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [25]:
print(nltk.classify.accuracy(classifier,test_set))

0.77


In [26]:
classifier.show_most_informative_features(5)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      8.4 : 1.0
          contains(mena) = True              neg : pos    =      7.1 : 1.0
        contains(shoddy) = True              neg : pos    =      7.1 : 1.0
        contains(suvari) = True              neg : pos    =      7.1 : 1.0
    contains(schumacher) = True              neg : pos    =      7.1 : 1.0


## POS Tagging

In [36]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]]+=1
    suffix_fdist[word[-2:]]+=1
    suffix_fdist[word[-3:]]+=1
    
common_suffixes = list(suffix_fdist.keys())[:100]
print(common_suffixes)

['e', 'he', 'the', 'n', 'on', 'ton', 'y', 'ty', 'nty', 'd', 'nd', 'and', 'ry', 'ury', 'id', 'aid', 'ay', 'day', 'an', 'ion', 'f', 'of', 's', "'s", "a's", 't', 'nt', 'ent', 'ary', 'ed', 'ced', '`', '``', 'o', 'no', 'ce', 'nce', "'", "''", 'at', 'hat', 'ny', 'any', 'es', 'ies', 'k', 'ok', 'ook', 'ace', '.', 'r', 'er', 'her', 'in', 'end', 'ts', 'nts', 'ity', 've', 'ive', 'ee', 'tee', ',', 'h', 'ch', 'ich', 'ad', 'had', 'l', 'll', 'all', 'ge', 'rge', 'ves', 'se', 'ise', 'ks', 'nks', 'a', 'ta', 'nta', 'or', 'for', 'ner', 'as', 'was', 'ted', 'ber', 'm', 'rm', 'erm', 'en', 'een', 'ged', 'by', 'ior', 'rt', 'urt', 'dge', 'od']


In [37]:
len(common_suffixes)

100

In [38]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

In [39]:
pos_features('lily')

{'endswith(e)': False,
 'endswith(he)': False,
 'endswith(the)': False,
 'endswith(n)': False,
 'endswith(on)': False,
 'endswith(ton)': False,
 'endswith(y)': True,
 'endswith(ty)': False,
 'endswith(nty)': False,
 'endswith(d)': False,
 'endswith(nd)': False,
 'endswith(and)': False,
 'endswith(ry)': False,
 'endswith(ury)': False,
 'endswith(id)': False,
 'endswith(aid)': False,
 'endswith(ay)': False,
 'endswith(day)': False,
 'endswith(an)': False,
 'endswith(ion)': False,
 'endswith(f)': False,
 'endswith(of)': False,
 'endswith(s)': False,
 "endswith('s)": False,
 "endswith(a's)": False,
 'endswith(t)': False,
 'endswith(nt)': False,
 'endswith(ent)': False,
 'endswith(ary)': False,
 'endswith(ed)': False,
 'endswith(ced)': False,
 'endswith(`)': False,
 'endswith(``)': False,
 'endswith(o)': False,
 'endswith(no)': False,
 'endswith(ce)': False,
 'endswith(nce)': False,
 "endswith(')": False,
 "endswith('')": False,
 'endswith(at)': False,
 'endswith(hat)': False,
 'endswith(ny

In [40]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
featuresets

[({'endswith(e)': True,
   'endswith(he)': True,
   'endswith(the)': True,
   'endswith(n)': False,
   'endswith(on)': False,
   'endswith(ton)': False,
   'endswith(y)': False,
   'endswith(ty)': False,
   'endswith(nty)': False,
   'endswith(d)': False,
   'endswith(nd)': False,
   'endswith(and)': False,
   'endswith(ry)': False,
   'endswith(ury)': False,
   'endswith(id)': False,
   'endswith(aid)': False,
   'endswith(ay)': False,
   'endswith(day)': False,
   'endswith(an)': False,
   'endswith(ion)': False,
   'endswith(f)': False,
   'endswith(of)': False,
   'endswith(s)': False,
   "endswith('s)": False,
   "endswith(a's)": False,
   'endswith(t)': False,
   'endswith(nt)': False,
   'endswith(ent)': False,
   'endswith(ary)': False,
   'endswith(ed)': False,
   'endswith(ced)': False,
   'endswith(`)': False,
   'endswith(``)': False,
   'endswith(o)': False,
   'endswith(no)': False,
   'endswith(ce)': False,
   'endswith(nce)': False,
   "endswith(')": False,
   "endswith

In [41]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [42]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.5689706613625062


In [53]:
classifier.classify(pos_features('cities'))

'NNS'

In [43]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(was) == False: return 'PP$'
      if endswith(was) == True: return 'BEDZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



## Exploiting Context

Example 6-4. A part-of-speech classifier whose feature detector examines the context in which a word
appears in order to determine which part-of-speech tag should be assigned. In particular, the identity
of the previous word is included as a feature.

In [54]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
    "suffix(2)": sentence[i][-2:],
    "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [58]:
pos_features(brown.sents()[0],8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [59]:
tagged_sents = brown.tagged_sents(categories='news')
tagged_sents

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [60]:
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))

In [66]:
size=int(len(featuresets)*0.1)
train_set,test_set=featuresets[size:],featuresets[:size]
classifier=nltk.NaiveBayesClassifier.train(train_set)

In [67]:
print(nltk.classify.accuracy(classifier,test_set))

0.7891596220785678


## Sequence Classification

- In order to capture the dependencies between related classification tasks, we can use
joint classifier models, which choose an appropriate labeling for a collection of related
inputs. In the case of part-of-speech tagging, a variety of different sequence
classifier models can be used to jointly choose part-of-speech tags for all the words in
a given sentence.


- One sequence classification strategy, known as consecutive classification or greedy
sequence classification, is to find the most likely class label for the first input, then
to use that answer to help find the best label for the next input. The process can then
be repeated until all of the inputs have been labeled

In [156]:
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
        "suffix(2)": sentence[i][-2:],
        "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

In [159]:
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self,train_sents):
        train_set=[]
        for tagged_sent in train_sents:
            untagged_sent=nltk.tag.untag(tagged_sent)
            history=[]
            for i,(word,tag) in enumerate(tagged_sent):
                feature_set=pos_features(untagged_sent,i,history)
                train_set.append((feature_set,tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)    
    
    def tag(self,sentence):
        history=[]
        for i,word in enumerate(sentence):
            featureset=pos_features(sentence,i,history)
            tag=self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence,history)

In [161]:
tagged_sents=brown.tagged_sents(categories='news')
size=int(len(tagged_sents)*0.1)
train_sents,test_sents=tagged_sents[size:],tagged_sents[:size]
tagger=ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print(tagger.evaluate(test_sents))


0.7980528511821975


In [181]:
train_sents

[[('He', 'PPS'), ('assured', 'VBD'), ('Mr.', 'NP'), ('Martinelli', 'NP'), ('and', 'CC'), ('the', 'AT'), ('council', 'NN'), ('that', 'CS'), ('he', 'PPS'), ('would', 'MD'), ('study', 'VB'), ('the', 'AT'), ('correct', 'JJ'), ('method', 'NN'), ('and', 'CC'), ('report', 'VB'), ('back', 'RB'), ('to', 'IN'), ('the', 'AT'), ('council', 'NN'), ('as', 'QL'), ('soon', 'RB'), ('as', 'CS'), ('possible', 'JJ'), ('.', '.')], [('Mr.', 'NP'), ('Martinelli', 'NP'), ('said', 'VBD'), ('yesterday', 'NR'), ('that', 'CS'), ('the', 'AT'), ('Citizens', 'NNS-TL'), ('Group', 'NN-TL'), ('of', 'IN-TL'), ('Johnston', 'NP-TL'), ('will', 'MD'), ('meet', 'VB'), ('again', 'RB'), ('July', 'NP'), ('24', 'CD'), ('to', 'TO'), ('plan', 'VB'), ('further', 'JJR'), ('strategy', 'NN'), ('in', 'IN'), ('the', 'AT'), ('charter', 'NN'), ('movement', 'NN'), ('.', '.')], ...]

In [187]:
tagger.evaluate([[('run','VBD')]])

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  tagger.evaluate([[('run','VBD')]])


0.0

#### FOR NP parsing

<!-- class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutivePosTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags) -->

### NOTES

- Another solution is to assign scores to all of the possible sequences of part-of-speech
tags, and to choose the sequence whose overall score is highest. This is the approach
taken by ##Hidden Markov Models.


- These probabilities are then combined to calculate
probability scores for tag sequences, and the tag sequence with the highest probability
is chosen.


- Given a tag set with 30 tags, there are about 600 trillion (3010) ways to label a 10-word sentence.
In order to avoid considering all these possible sequences separately, Hidden Markov
Models require that the feature extractor only look at the most recent tag

## Sequence Segmentation

In [190]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\SAYED\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


True

In [191]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [192]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
    'prevword': tokens[i-1].lower(),
    'punct': tokens[i],
    'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [193]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
for i in range(1, len(tokens)-1)
if tokens[i] in '.?!']

In [194]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.9461279461279462


In [196]:
# Example 6-6. Classification-based sentence segmenter.

def segment_sentences(words):
    start = 0
    sents = []
    for i, word in words:
        if word in '.?!' and classifier.classify(words, i) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])

In [214]:
## Unigram tagging

from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [215]:
unigram_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_tagger.evaluate(brown_tagged_sents)


0.9349006503968017

In [221]:
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_tagger.evaluate(test_sents)


0.8121200039868434

In [222]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

In [223]:
### Confusion Matrix 
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word, tag) in sent]

def apply_tagger(tagger, corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(apply_tagger(t2,brown.tagged_sents(categories='editorial')))
cm = nltk.ConfusionMatrix(gold, test)
print(cm)

           |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            