In [1]:
import nltk

### Supervised Classification

### Gender Identification

In [2]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [3]:
gender_features('Shrek')

{'last_letter': 'k'}

In [4]:
## feature names are case-sensitive strings # feature values are values with simple types, such as, booleans, numbers, strings.

In [5]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in
                  names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)

In [6]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

In [7]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [8]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
classifier.classify(gender_features('Neo'))

'male'

In [10]:
classifier.classify(gender_features('Emily'))

'female'

In [11]:
print(nltk.classify.accuracy(classifier, test_set))

0.79


In [12]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     34.2 : 1.0
             last_letter = 'k'              male : female =     31.6 : 1.0
             last_letter = 'f'              male : female =     16.6 : 1.0
             last_letter = 'p'              male : female =     11.2 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0


Modify the gender_features() function to provide the classifier with features encoding the length of the name, its first letter, and any other features that seem like they might be informative. Retrain the classifier with these new features, and test its accuracy.

In [13]:
def gender_features2(word):
    return {'first_letter': word[0]}
gender_features2('ben')

{'first_letter': 'b'}

In [14]:
def gender_features3(word):
    return len(word)

In [15]:
gender_features3('Emma')

4

In [16]:
def gender_features(word):
    return {'Length of name': len(word), 'First_letter is: ': word[0]}

In [17]:
gender_features('Kate')

{'Length of name': 4, 'First_letter is: ': 'K'}

In [18]:
from nltk.classify import apply_features

In [19]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
 [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)

In [20]:
#train_set, test_set = featuresets[500:], featuresets[:500]

In [21]:
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [22]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

In [23]:
classifier.classify(gender_features('reema'))

'female'

In [24]:
classifier.classify(gender_features('peter'))

'female'

In [25]:
print(nltk.classify.accuracy(classifier, test_set))

0.636


In [26]:
classifier.show_most_informative_features(3)

Most Informative Features
             last_letter = 'a'            female : male   =     34.2 : 1.0
             last_letter = 'k'              male : female =     31.6 : 1.0
             last_letter = 'f'              male : female =     16.6 : 1.0


In [27]:
## use this for larger corpora. this acts like a list but does not stores all the feature sets in memory

### Choosing The Right Features

In [28]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [29]:
gender_features2('peter')

{'first_letter': 'p',
 'last_letter': 'r',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 2,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 0,
 'has(h)': False,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 0,
 'has(n)': False,
 'count(o)': 0,
 'has(o)': False,
 'count(p)': 1,
 'has(p)': True,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 1,
 'has(r)': True,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 1,
 'has(t)': True,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [30]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.776


In [31]:
## feeding in too many features may lead to Overfitting, we can refine the feature set with the help of error analysis, 1. select a development set containing the corpus data for creating the model and development set is then sub-divided into the training set and the dev_test set. 

In [32]:
train_names = labeled_names[1500:] # train model
devtest_names = labeled_names[500:1500] # error analysis
test_names = labeled_names[:500] # final evaluation of the system

In [33]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]

In [34]:
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]

In [35]:
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

In [36]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [37]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.624


In [38]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name))

In [39]:
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Dee                           
correct=female   guess=male     name=Del                           
correct=female   guess=male     name=Evy                           
correct=female   guess=male     name=Fan                           
correct=female   guess=male     name=Gen                           
correct=female   guess=male     name=Halette                       
correct=female   guess=male     name=Halie                         
correct=female   guess=male     name=Halli                         
correct=female   guess=male     name=Hanni                         
correct=female   guess=male     name=Happy                         
correct=female   guess=male     name=Harmonie                      
correct=female   guess=male     name=Harriette                     
correct=female   guess=male     name=Harriot                       
correct=female   guess=male     name=Hedi                          
correct=female   guess=male     name=Hedwig     

In [40]:
def gender_features(word):
     return {'suffix1': word[-1:],
             'suffix2': word[-2:]}

In [41]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.783


In [42]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [43]:
for (tag, guess, name) in sorted(errors):
     print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Abagail                       
correct=female   guess=male     name=Agnes                         
correct=female   guess=male     name=Allsun                        
correct=female   guess=male     name=Angil                         
correct=female   guess=male     name=Annabell                      
correct=female   guess=male     name=Ashlen                        
correct=female   guess=male     name=Aubrey                        
correct=female   guess=male     name=Beau                          
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Blakeley                      
correct=female   guess=male     name=Britt                         
correct=female   guess=male     name=Brittan                       
correct=female   guess=male     name=Caitrin                       
correct=female   guess=male     name=Carlin                        
correct=female   guess=male     name=Cassandry  

In [44]:
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [45]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.783


### Document Classification

In [46]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [47]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [48]:
word_features = list(all_words)[:2000] 

def document_features(document): # we check whether the word is in the document as it is faster than checking the list
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [49]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

{'contains(plot)': True, 'contains(:)': True, 'contains(two)': True, 'contains(teen)': False, 'contains(couples)': False, 'contains(go)': False, 'contains(to)': True, 'contains(a)': True, 'contains(church)': False, 'contains(party)': False, 'contains(,)': True, 'contains(drink)': False, 'contains(and)': True, 'contains(then)': True, 'contains(drive)': False, 'contains(.)': True, 'contains(they)': True, 'contains(get)': True, 'contains(into)': True, 'contains(an)': True, 'contains(accident)': False, 'contains(one)': True, 'contains(of)': True, 'contains(the)': True, 'contains(guys)': False, 'contains(dies)': False, 'contains(but)': True, 'contains(his)': True, 'contains(girlfriend)': True, 'contains(continues)': False, 'contains(see)': False, 'contains(him)': True, 'contains(in)': True, 'contains(her)': False, 'contains(life)': False, 'contains(has)': True, 'contains(nightmares)': False, 'contains(what)': True, "contains(')": True, 'contains(s)': True, 'contains(deal)': False, 'contains

In [50]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [51]:
print(nltk.classify.accuracy(classifier, test_set)) 

0.77


In [52]:
classifier.show_most_informative_features(5)

Most Informative Features
        contains(sexist) = True              neg : pos    =      7.7 : 1.0
        contains(welles) = True              neg : pos    =      7.7 : 1.0
     contains(atrocious) = True              neg : pos    =      7.0 : 1.0
 contains(unimaginative) = True              neg : pos    =      7.0 : 1.0
    contains(schumacher) = True              neg : pos    =      7.0 : 1.0


### Part Of Speech Tagging

In [55]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
    

In [56]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [57]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [58]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
        return features

In [59]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]

In [60]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [61]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.17135753356539035

In [62]:
classifier.classify(pos_features('cats'))

'IN'

In [63]:
print(classifier.pseudocode(depth=5))

if endswith(e) == False: return 'IN'
if endswith(e) == True: return 'AT'



### Exploiting Context

In [65]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:], 
               "suffix(2)": sentence[i][-2:], 
               "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [66]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [67]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

### Sequence Classification

In [73]:
def pos_features(sentence, i, history): 
     features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

class ConsecutivePosTagger(nltk.TaggerI): 

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 5)

In [74]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

NameError: name 'ConsecutivePosTagger' is not defined

### More Examples Of SuperVised Classification

#### Sentence Segmentation

In [None]:
## Sentence Segmentation : Classification task for punctuation

In [79]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [80]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [81]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']

In [82]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [83]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

### Identifying Dialogue Act Types

In [84]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [85]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

In [88]:
#posts.get('class')

In [89]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
                for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.668


### Recoginizing Textual Entailment

It is the task of determining whether a given piece of text T entails anothr text called the "hypothesis".

In [90]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    feature = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features 

In [91]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])

In [95]:
extractor = nltk.RTEFeatureExtractor(rtepair)
#print(extractor.text_words)

AttributeError: 'list' object has no attribute 'text'

In [94]:
print(extractor.hyp_words)

NameError: name 'extractor' is not defined

In [96]:
import random

In [97]:
from nltk.corpus import brown

In [98]:
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [99]:
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

In [100]:
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

### Accuracy

In [102]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set))) 

ValueError: too many values to unpack (expected 2)

### Confusion Matrices

In [103]:
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word, tag) in sent]
def apply_tagger(tagger, corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]
gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
cm = nltk.ConfusionMatrix(gold, test)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

NameError: name 't2' is not defined

### Decision Trees

A decision tree is a simple flowchart that selects labels for input values. This flowchart consists of decision nodes, which check feature values, and leaf nodes, which assign labels. To choose the label for an input value, we begin at the flowchart's initial decision node, known as its root node. This node contains a condition that checks one of the input value's features, and selects a branch based on that feature's value.A decision stump is a decision tree with a single node that decides how to classify inputs based on a single feature.

### Entropy And Information Gain

In [104]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

In [105]:
print(entropy(['male', 'female', 'male', 'male']))

0.8112781244591328


In [None]:
### Naive Bayes Classification

### Maximum Entropy Classifiers 