# Learning to Classify Text

<b> Supervised Classification

Classification is the task of choosing the correct class label for a given input. In basic classification tasks, each input is considered in isolation from all other inputs, and the set of labels is defined in advance. Some examples of classification tasks are:

Deciding whether an email is spam or not.
Deciding what the topic of a news article is, from a fixed list of topic areas such as "sports," "technology," and "politics."
Deciding whether a given occurrence of the word bank is used to refer to a river bank, a financial institution, the act of tilting to the side, or the act of depositing something in a financial institution.

A classifier is called supervised if it is built based on training corpora containing the correct label for each input.

<b> Gender Identification

In [1]:
def gender_features(word):
    return {'last_letter':word[-1]}

gender_features('Shrek')

{'last_letter': 'k'}

In [2]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])


In [3]:
import random
random.shuffle(labeled_names)

In [4]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

In [5]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [6]:
import nltk
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
classifier.classify(gender_features('Neo'))

'male'

In [8]:
classifier.classify(gender_features('Trinity'))

'female'

In [9]:
print(nltk.classify.accuracy(classifier, test_set))

0.758


In [10]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     34.5 : 1.0
             last_letter = 'k'              male : female =     32.2 : 1.0
             last_letter = 'f'              male : female =     16.6 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0


In [11]:
from nltk.classify import apply_features

In [12]:
train_set = apply_features(gender_features, labeled_names[500:])
train_set

[({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ...]

In [13]:
test_set = apply_features(gender_features, labeled_names[:500])
test_set

[({'last_letter': 'l'}, 'female'), ({'last_letter': 'a'}, 'female'), ...]

<b> Choosing the Right Feature

<b> A Feature Extractor that Overfits Gender Features </b> The feature sets returned by this feature extractor contain a large number of specific features, leading to overfitting for the relatively small Names Corpus.

In [14]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [15]:
gender_features2('John') 

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [16]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

In [17]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [18]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [19]:
print(nltk.classify.accuracy(classifier, test_set))

0.746


In [20]:
train_names = labeled_names[1500:]

In [21]:
devtest_names = labeled_names[500:1500]

In [22]:
test_names = labeled_names[:500]

In [23]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print(nltk.classify.accuracy(classifier, devtest_set))

0.79


In [24]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [25]:
for (tag, guess, name) in sorted(errors):
     print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Alys                          
correct=female   guess=male     name=Anett                         
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Astrix                        
correct=female   guess=male     name=Avrit                         
correct=female   guess=male     name=Ayn                           
correct=female   guess=male     name=Beatriz                       
correct=female   guess=male     name=Beau                          
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Brit                          
correct=female   guess=male     name=Brittan                       
correct=female   guess=male     name=Carlen                        
correct=female   guess=male     name=Carol      

In [26]:
def gender_features(word):
     return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [27]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.796


<b> Document Classification

In [28]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [29]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

In [30]:
def document_features(document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [31]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'conta

In [32]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [33]:
print(nltk.classify.accuracy(classifier, test_set))

0.79


In [34]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     17.4 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.3 : 1.0
        contains(seagal) = True              neg : pos    =      7.0 : 1.0
          contains(lame) = True              neg : pos    =      6.0 : 1.0


<b> Part-of-Speech Tagging

In [1]:
from nltk.corpus import brown

In [36]:
suffix_fdist = nltk.FreqDist()

for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [37]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [38]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [39]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [40]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [41]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [None]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

In [None]:
classifier.classify(pos_features('cats'))

In [None]:
print(classifier.pseudocode(depth=4))

<b> Exploiting Context

In [5]:
def pos_features(sentence, i): 
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [6]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [7]:
tagged_sents = brown.tagged_sents(categories='news')

In [8]:
import nltk
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )


In [9]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [10]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

<b> Sequence Classification

In [11]:
def pos_features(sentence, i, history):
    
    features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
      
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else: 
     
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features
       

class ConsecutivePosTagger(nltk.TaggerI): 

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [12]:
tagged_sents = brown.tagged_sents(categories='news')

In [13]:
size = int(len(tagged_sents) * 0.1)

In [14]:
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

In [15]:
tagger = ConsecutivePosTagger(train_sents)

In [16]:
print(tagger.evaluate(test_sents))

0.7980528511821975


<b> Sentence Segmentation

In [17]:
sents = nltk.corpus.treebank_raw.sents()

In [18]:
tokens = []

In [19]:
boundaries = set()

In [20]:
offset = 0

In [21]:
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [22]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [23]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']

In [24]:
size = int(len(featuresets) * 0.1)

In [25]:
train_set, test_set = featuresets[size:], featuresets[:size]

In [26]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [27]:
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

<b> Classification Based Sentence Segmenter

In [28]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

<b>  Identifying Dialogue Act Types

In [29]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [30]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

In [31]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]

In [32]:
size = int(len(featuresets) * 0.1)

In [33]:
train_set, test_set = featuresets[size:], featuresets[:size]

In [34]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [35]:
print(nltk.classify.accuracy(classifier, test_set))

0.667


<b> Recognizing Textual Entailment

In [36]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [37]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]

In [38]:
extractor = nltk.RTEFeatureExtractor(rtepair)

In [39]:
print(extractor.text_words)

{'Co', 'was', 'Organisation', 'terrorism.', 'binds', 'Davudi', 'China', 'former', 'meeting', 'association', 'Parviz', 'Soviet', 'four', 'republics', 'Iran', 'central', 'that', 'together', 'at', 'Russia', 'Shanghai', 'operation', 'fledgling', 'fight', 'SCO', 'Asia', 'representing'}


In [40]:
print(extractor.hyp_words)

{'China', 'SCO.', 'member'}


In [41]:
print(extractor.overlap('word'))

set()


In [42]:
print(extractor.overlap('ne'))

{'China'}


In [44]:
print(extractor.hyp_extra('word'))

{'member'}


<b> Evaluation

<b> The Test Set

In [45]:
import random
from nltk.corpus import brown

In [46]:
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [47]:
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

In [48]:
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

<b> Accuracy

In [50]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set))) 

ValueError: too many values to unpack (expected 2)

<b> Confusion Matrices

In [43]:
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word, tag) in sent]

In [44]:
def apply_tagger(tagger, corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [45]:
gold = tag_list(brown.tagged_sents(categories='editorial'))

In [47]:
brown_tagged_sents=brown.tagged_sents(categories='news')
size=int(len(brown_tagged_sents)*0.9)
train_sents=brown_tagged_sents[:size]
t0=nltk.DefaultTagger('NN')
backoff=nltk.DefaultTagger('NN')
t1=nltk.UnigramTagger(train_sents,backoff=t0)
t2=nltk.BigramTagger(train_sents,backoff=t1)

test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))

In [48]:
cm = nltk.ConfusionMatrix(gold, test)

In [49]:
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |                                         N                      |
    |      N      I      A      J             N             V      N |
    |      N      N      T      J      .      S      ,      B      P |
----+----------------------------------------------------------------+
 NN | <11.9%>  0.0%      .   0.2%      .   0.0%      .   0.2%   0.0% |
 IN |   0.0%  <9.0%>     .      .      .   0.0%      .      .      . |
 AT |      .      .  <8.6%>     .      .      .      .      .      . |
 JJ |   1.6%      .      .  <4.0%>     .      .      .   0.0%   0.0% |
  . |      .      .      .      .  <4.8%>     .      .      .      . |
NNS |   1.5%      .      .      .      .  <3.3%>     .      .   0.0% |
  , |      .      .      .      .      .      .  <4.4%>     .      . |
 VB |   0.9%      .      .   0.0%      .      .      .  <2.4%>     . |
 NP |   1.0%      .      .   0.0%      .      .      .      .  <1.9%>|
----+----------------------------------------------------------------+
(row =

<b> Entropy and Information Gain

In [50]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

In [51]:
print(entropy(['male', 'male', 'male', 'male'])) 

-0.0


In [52]:
print(entropy(['female', 'male', 'female', 'male']))

1.0


In [53]:
print(entropy(['female', 'female', 'male', 'female']))

0.8112781244591328


In [54]:
print(entropy(['female', 'female', 'female', 'female'])) 

-0.0
