# Learning to Classify Text #

Source: [Natural Language Processing with Python, Chapter 6](https://www.nltk.org/book/ch06.html)

## Supervised Classification ##

## Further Examples of Supervised Classification ##

## Evaluation ##

## Decision Trees ##

## Naive Bayes Classifiers ##

## Maximum Entropy Classifiers ##

In [2]:
def gender_features(word):
    """Extract the single feature used by the first gender classifier.

    Args:
        word: The name to featurize (a string).

    Returns:
        A dict with one key, 'last_letter', holding the final character of
        ``word``. An empty ``word`` yields an empty string instead of raising
        IndexError.
    """
    # Slicing (rather than indexing with -1) is what keeps empty input safe.
    return {'last_letter': word[-1:]}

# Sanity check on a sample name; the cell displays the resulting feature dict.
gender_features('Shrek')

{'last_letter': 'k'}

In [3]:
import random

from nltk.corpus import names

# Fixed seed so the shuffle -- and therefore every split sliced out of
# labeled_names -- is repeatable across kernel restarts.
random.seed(0)

# Label each name by the corpus file it was read from, then shuffle so that
# positional slices of the list contain a mix of both labels.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

In [5]:
import nltk

# Pair each name's feature dict with its gender label.
featuresets = [(gender_features(name), label) for name, label in labeled_names]

# Hold out the first 500 examples for testing; train on the remainder.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [6]:
# Ask the trained classifier to label the name 'Neo' from its extracted features.
classifier.classify(gender_features('Neo'))

'male'

In [7]:
# Same check for the name 'Trinity'.
classifier.classify(gender_features('Trinity'))

'female'

In [8]:
# Print the classifier's accuracy on the held-out test_set.
print(nltk.classify.accuracy(classifier, test_set))

0.772


In [9]:
# Show the 5 most informative feature values together with the label ratio
# (e.g. female : male) the classifier associates with each.
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     33.3 : 1.0
             last_letter = 'k'              male : female =     32.2 : 1.0
             last_letter = 'f'              male : female =     27.7 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0


In [10]:
from nltk.classify import apply_features

# Same 500-example hold-out as the list-comprehension version, but built
# through apply_features.
# NOTE(review): per the NLTK docs apply_features returns a lazy, list-like
# object rather than materializing every feature dict -- confirm for the
# installed NLTK version.
test_set = apply_features(gender_features, labeled_names[:500])
train_set = apply_features(gender_features, labeled_names[500:])

In [11]:
def gender_features2(name):
    """Build a larger feature set for a name: edge letters plus per-letter stats.

    Args:
        name: The name to featurize (a string). Letter matching ignores case.

    Returns:
        A dict containing 'first_letter' and 'last_letter' (lowercased; empty
        strings when ``name`` is empty) and, for every letter a-z,
        'count(<letter>)' (number of occurrences in the lowercased name) and
        'has(<letter>)' (True if the letter occurs at all).
    """
    # Lowercase once up front instead of twice per alphabet letter.
    lowered = name.lower()
    features = {
        # Slices rather than indexes so an empty name does not raise IndexError.
        "first_letter": name[:1].lower(),
        "last_letter": name[-1:].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = letter in lowered
    return features

In [12]:
# Rebuild the feature sets with gender_features2, keep the same 500-example
# hold-out, retrain, and report test accuracy.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.778


In [13]:
# Three-way split of the shuffled names: the first 500 for the final test,
# the next 1000 for development/error analysis, the rest for training.
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [14]:
# Featurize each split separately; only the dev-test set is scored here, so
# test_set is built but not evaluated in this cell.
test_set = [(gender_features(name), label) for name, label in test_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
train_set = [(gender_features(name), label) for name, label in train_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.748


In [15]:
# Collect every dev-test name the classifier gets wrong, stored as
# (correct label, predicted label, name) triples.
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correct prediction, nothing to record
    errors.append((tag, guess, name))

In [16]:
# One left-aligned row per misclassified name, ordered by
# (correct label, predicted label, name).
for tag, guess, name in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Adrien                        
correct=female   guess=male     name=Aileen                        
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Allison                       
correct=female   guess=male     name=Ardys                         
correct=female   guess=male     name=Arlen                         
correct=female   guess=male     name=Ashlen                        
correct=female   guess=male     name=Beatriz                       
correct=female   guess=male     name=Beitris                       
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Britt                         
correct=female   guess=male     name=Brooks                        
correct=female   guess=male     name=Calypso    

In [17]:
def gender_features(word):
    """Return the one- and two-character suffixes of ``word`` as features.

    This redefines the notebook's earlier one-feature ``gender_features``;
    cells run after this one pick up the suffix-based version.

    Args:
        word: The name to featurize (a string).

    Returns:
        A dict with 'suffix1' (last character) and 'suffix2' (last two
        characters). Slicing means shorter inputs yield shorter suffixes.
    """
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features

In [18]:
# Re-featurize the training and dev-test names with the current
# gender_features, retrain, and re-measure dev-test accuracy.
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
train_set = [(gender_features(name), label) for name, label in train_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.782
