In [7]:
import nltk, re, pprint
from nltk.corpus import names
import random
# Fetch the 'names' corpus that the following cells read via names.words(...).
nltk.download('names')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [0]:
# Build a single list of (name, gender) pairs from the two corpus word lists.
# The corpus reader is reached as `nltk.corpus.names` instead of the imported
# bare `names`, because this cell rebinds `names` to the list it builds: with
# the bare name, running the cell a second time would call `.words` on a list
# and raise AttributeError. Later cells keep using the `names` list unchanged.
names = ([(name, 'male') for name in nltk.corpus.names.words('male.txt')] +
         [(name, 'female') for name in nltk.corpus.names.words('female.txt')])

In [0]:
# Shuffle in place with a dedicated fixed-seed generator so that the slices
# taken from `names` in later cells -- and therefore the reported accuracies --
# come out the same on every fresh run of the notebook.
# Re-running only this cell shuffles the already-shuffled list again; rerun
# from the cell that builds `names` to get the reproducible order back.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(names)

In [0]:
def gender_features(word):
    """Return the feature dict used by the last-letter gender classifier.

    Args:
        word: The name to featurize (a string).

    Returns:
        A one-entry dict ``{'last_letter': <final character of word>}``.
        The character is kept exactly as written (no case folding). An
        empty ``word`` yields ``''`` rather than raising IndexError.
    """
    # Slicing, unlike word[-1], is safe on an empty string.
    return {'last_letter': word[-1:]}


In [0]:
# Pair each name's feature dict with its gender label.
featuresets = [(gender_features(person), label) for person, label in names]

# The first 500 entries are held out for testing; everything after them is
# used for training.
test_set = featuresets[:500]
train_set = featuresets[500:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [16]:
# Spot-check the last-letter classifier on a hand-picked name.
classifier.classify(gender_features('Neo'))

'male'

In [20]:
# Second spot-check of the last-letter classifier on a hand-picked name.
classifier.classify(gender_features('Trinity'))

'female'

In [21]:
# Score the classifier against the held-out test_set.
print(nltk.classify.accuracy(classifier, test_set))

0.772


In [22]:
# Show which last_letter values separate the two labels most strongly.
classifier.show_most_informative_features()

Most Informative Features
             last_letter = 'a'            female : male   =     34.3 : 1.0
             last_letter = 'k'              male : female =     33.0 : 1.0
             last_letter = 'v'              male : female =     18.5 : 1.0
             last_letter = 'f'              male : female =     16.5 : 1.0
             last_letter = 'p'              male : female =     11.1 : 1.0
             last_letter = 'm'              male : female =      9.9 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'o'              male : female =      8.0 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0


In [0]:
from nltk.classify import apply_features


In [0]:
# Same 500-name hold-out as before, but the feature sets are produced by
# apply_features from the labeled names instead of a precomputed list.
test_set = apply_features(gender_features, names[:500])
train_set = apply_features(gender_features, names[500:])

In [67]:
# Peek at one training example: a (feature dict, label) pair.
train_set[1]

({'last_letter': 'o'}, 'male')

In [0]:
# Retrain the classifier on the training set built with apply_features.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [55]:
# Score the retrained classifier against the apply_features-built test_set.
print(nltk.classify.accuracy(classifier, test_set))

0.772


In [56]:
# Show the most informative features of the retrained classifier.
classifier.show_most_informative_features()

Most Informative Features
             last_letter = 'a'            female : male   =     34.3 : 1.0
             last_letter = 'k'              male : female =     33.0 : 1.0
             last_letter = 'v'              male : female =     18.5 : 1.0
             last_letter = 'f'              male : female =     16.5 : 1.0
             last_letter = 'p'              male : female =     11.1 : 1.0
             last_letter = 'm'              male : female =      9.9 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'o'              male : female =      8.0 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0


In [0]:
def gender_features2(name):
    """Return a richer, case-insensitive feature dict for a name.

    Args:
        name: The name to featurize (a string).

    Returns:
        A dict with these entries, all computed on lower-cased text:
          * ``"firstletter"``: the first character.
          * ``"secondlastletter"``: the last TWO characters (the key name is
            kept as is because it appears in the feature output).
          * ``"lastletter"``: the last character.
          * ``"count(x)"`` / ``"has(x)"`` for every letter a-z: the number of
            occurrences of the letter, and whether it occurs at all.
        For an empty ``name`` the three positional features are ``''`` and
        all counts are 0 / False, instead of raising IndexError.
    """
    features = {}
    # Slice first, then lower-case: slices never raise on an empty string and
    # non-empty names get exactly the same values as indexing would give.
    features["firstletter"] = name[:1].lower()
    features["secondlastletter"] = name[-2:].lower()
    features["lastletter"] = name[-1:].lower()

    # Lower-case the whole name once instead of twice per alphabet letter.
    lowered = name.lower()
    for letter in "abcdefghijklmnopqrstuvwxyz":
        occurrences = lowered.count(letter)
        features['count(%s)' % letter] = occurrences
        features['has(%s)' % letter] = occurrences > 0
    return features

In [76]:
# Three-way split of the labeled names: the first 500 for the final test, the
# next 1000 for validation (used for error analysis), the rest for training.
test_names = names[:500]
valid_names = names[500:1500]
train_names = names[1500:]

# Build the three feature sets with the richer gender_features2 extractor.
train_set2, valid_set2, test_set2 = (
    apply_features(gender_features2, subset)
    for subset in (train_names, valid_names, test_names)
)

classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

# Validation accuracy first, then test accuracy, then the strongest features.
print(nltk.classify.accuracy(classifier2, valid_set2))
print(nltk.classify.accuracy(classifier2, test_set2))
classifier2.show_most_informative_features()

0.799
0.794
Most Informative Features
        secondlastletter = 'na'           female : male   =     84.6 : 1.0
        secondlastletter = 'la'           female : male   =     68.4 : 1.0
        secondlastletter = 'ia'           female : male   =     34.8 : 1.0
              lastletter = 'a'            female : male   =     30.5 : 1.0
        secondlastletter = 'sa'           female : male   =     30.1 : 1.0
              lastletter = 'k'              male : female =     28.0 : 1.0
        secondlastletter = 'rd'             male : female =     27.5 : 1.0
        secondlastletter = 'do'             male : female =     24.0 : 1.0
        secondlastletter = 'us'             male : female =     23.2 : 1.0
        secondlastletter = 'io'             male : female =     22.8 : 1.0


In [0]:
# Collect every validation name that classifier2 labels incorrectly, stored as
# (true label, predicted label, name).
errors = []
for name, tag in valid_names:
    guess = classifier2.classify(gender_features2(name))
    if guess == tag:
        continue  # correctly classified, nothing to record
    errors.append((tag, guess, name))

In [73]:
# Report the misclassifications, ordered by true label, then guess, then name
# (the natural sort order of the (tag, guess, name) tuples).
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Amber                         
correct=female   guess=male     name=Aubry                         
correct=female   guess=male     name=Berthe                        
correct=female   guess=male     name=Berty                         
correct=female   guess=male     name=Bethany                       
correct=female   guess=male     name=Biddy                         
correct=female   guess=male     name=Bo                            
correct=female   guess=male     name=Brandy                        
correct=female   guess=male     name=Bren                          
correct=female   guess=male     name=Buffy                         
correct=female   guess=male     name=Calypso                       
correct=female   guess=male     name=Carrol                        
correct=female   guess=male     name=Cher                          
correct=female   guess=male     name=Cherry     