# Predict Gender Using Only Last Letter

In [1]:
import nltk

# define a feature extraction function for each name
def gender_features(word):
    return{'last_letter': word[-1]}

print(gender_features('Shrek'))

{'last_letter': 'k'}


In [2]:
# resource for male and female first names
from nltk.corpus import names
nltk.download('names')

print(names.words('male.txt')[:20])
print(names.words('female.txt')[:20])

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner', 'Abraham', 'Abram', 'Ace', 'Adair', 'Adam']
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline', 'Adara', 'Addie', 'Addis', 'Adel', 'Adela']


[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


In [3]:
# make list of male and female names paired with gender
namesgender = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

print(len(namesgender))
print(namesgender[:20])
print(namesgender[7924:])

7944
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male'), ('Abdullah', 'male'), ('Abe', 'male'), ('Abel', 'male'), ('Abelard', 'male'), ('Abner', 'male'), ('Abraham', 'male'), ('Abram', 'male'), ('Ace', 'male'), ('Adair', 'male'), ('Adam', 'male')]
[('Zena', 'female'), ('Zenia', 'female'), ('Zia', 'female'), ('Zilvia', 'female'), ('Zita', 'female'), ('Zitella', 'female'), ('Zoe', 'female'), ('Zola', 'female'), ('Zonda', 'female'), ('Zondra', 'female'), ('Zonnya', 'female'), ('Zora', 'female'), ('Zorah', 'female'), ('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]


In [4]:
# put the list into random order
import random

random.shuffle(namesgender)
print(namesgender[:20])

[('Wandie', 'female'), ('Giacomo', 'male'), ('Emma', 'female'), ('Theodoric', 'male'), ('Melosa', 'female'), ('Keslie', 'female'), ('Correy', 'female'), ('Marion', 'male'), ('Bartel', 'male'), ('Ginni', 'female'), ('Mordecai', 'male'), ('Germana', 'female'), ('Chris', 'male'), ('Philippine', 'female'), ('Louie', 'male'), ('Joby', 'female'), ('Barth', 'male'), ('Aziz', 'male'), ('Reina', 'female'), ('Jonah', 'male')]


In [5]:
# separate the names into training and test
train_names = namesgender[500:]
test_names = namesgender[:500]

In [6]:
# use our features to train a classify and test on the development test set
train_set = [(gender_features(n), g) for (n, g) in train_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]

print(train_set[:20])

[({'last_letter': 'l'}, 'female'), ({'last_letter': 't'}, 'male'), ({'last_letter': 's'}, 'female'), ({'last_letter': 's'}, 'male'), ({'last_letter': 't'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'o'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'i'}, 'female'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'h'}, 'female'), ({'last_letter': 'l'}, 'female')]


In [7]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

# classify new instances
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

# classify accuracy function runs the classifier on the test set and reports

# comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

male
female
0.752


In [8]:
# this function available for naive bayes classifiers
print(classifier.show_most_informative_features(20))

Most Informative Features
             last_letter = 'k'              male : female =     44.6 : 1.0
             last_letter = 'a'            female : male   =     34.5 : 1.0
             last_letter = 'f'              male : female =     16.7 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0
             last_letter = 'd'              male : female =      9.8 : 1.0
             last_letter = 'm'              male : female =      9.3 : 1.0
             last_letter = 'o'              male : female =      8.0 : 1.0
             last_letter = 'r'              male : female =      6.9 : 1.0
             last_letter = 'g'              male : female =      5.5 : 1.0
             last_letter = 'w'              male : female =      4.5 : 1.0
             last_letter = 't'              male : female =      4.1 : 1.0
             last_letter = 's'              male : female =      4.1 : 1.0

In [9]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    errors = []
    for (name, tag) in test:
        guess = classifier.classify(gender_features(name))
        if guess != tag:
            errors.append( (tag, guess, name) )
    return errors

errors = geterrors(test_names)
print(len(errors))

124


In [10]:
# define a function to print the errors
def printerrors(errors):
    for (tag, guess, name) in sorted(errors):
        print('correct={:<8s} guess={:<8s} name={:<30s}'.format(tag, guess, name))
        
printerrors(errors)

correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Alis                          
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Bird                          
correct=female   guess=male     name=Calypso                       
correct=female   guess=male     name=Carolan                       
correct=female   guess=male     name=Carolann                      
correct=female   guess=male     name=Charleen                      
correct=female   guess=male     name=Charmion                      
correct=female   guess=male     name=Clem                          
correct=female   guess=male     name=Damaris                       
correct=female   guess=male     name=Demeter                       
correct=female   guess=male     name=Diann                         
correct=female   guess=male     name=Ethelyn                       
correct=female   guess=male     name=Fallon     

# Predict Gender Using Last Two Letters

In [11]:
# define a feature extraction function for each name
def gender_features2(word):
    return{'last_two': word[-2:]}

print(gender_features2('Shrek'))

{'last_two': 'ek'}


In [12]:
# use our features to train a classify and test on the development test set
train_set = [(gender_features2(n), g) for (n, g) in train_names]
test_set = [(gender_features2(n), g) for (n, g) in test_names]

print(train_set[:20])

[({'last_two': 'al'}, 'female'), ({'last_two': 'tt'}, 'male'), ({'last_two': 'is'}, 'female'), ({'last_two': 'os'}, 'male'), ({'last_two': 'it'}, 'female'), ({'last_two': 'ne'}, 'female'), ({'last_two': 'in'}, 'male'), ({'last_two': 'ie'}, 'male'), ({'last_two': 'ee'}, 'female'), ({'last_two': 'ue'}, 'female'), ({'last_two': 'ky'}, 'male'), ({'last_two': 'be'}, 'female'), ({'last_two': 'ie'}, 'male'), ({'last_two': 'ie'}, 'male'), ({'last_two': 'io'}, 'male'), ({'last_two': 'ee'}, 'female'), ({'last_two': 'ri'}, 'female'), ({'last_two': 'il'}, 'male'), ({'last_two': 'ah'}, 'female'), ({'last_two': 'll'}, 'female')]


In [13]:
classifier2 = nltk.NaiveBayesClassifier.train(train_set)

# classify new instances
print(classifier2.classify(gender_features2('Neo')))
print(classifier2.classify(gender_features2('Trinity')))

# classify accuracy function runs the classifier on the test set and reports

# comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier2, test_set))

male
female
0.784


In [14]:
# this function available for naive bayes classifiers
print(classifier2.show_most_informative_features(20))

Most Informative Features
                last_two = 'na'           female : male   =    100.2 : 1.0
                last_two = 'la'           female : male   =     74.8 : 1.0
                last_two = 'ia'           female : male   =     39.6 : 1.0
                last_two = 'sa'           female : male   =     36.5 : 1.0
                last_two = 'rd'             male : female =     33.2 : 1.0
                last_two = 'us'             male : female =     27.5 : 1.0
                last_two = 'ra'           female : male   =     26.8 : 1.0
                last_two = 'do'             male : female =     26.2 : 1.0
                last_two = 'ta'           female : male   =     25.6 : 1.0
                last_two = 'ld'             male : female =     25.1 : 1.0
                last_two = 'rt'             male : female =     22.7 : 1.0
                last_two = 'os'             male : female =     17.3 : 1.0
                last_two = 'io'             male : female =     16.4 : 1.0

In [15]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    errors = []
    for (name, tag) in test:
        guess = classifier2.classify(gender_features2(name))
        if guess != tag:
            errors.append( (tag, guess, name) )
    return errors

errors = geterrors(test_names)
print(len(errors))

108


In [16]:
# define a function to print the errors
def printerrors(errors):
    for (tag, guess, name) in sorted(errors):
        print('correct={:<8s} guess={:<8s} name={:<30s}'.format(tag, guess, name))
        
printerrors(errors)

correct=female   guess=male     name=Bird                          
correct=female   guess=male     name=Calypso                       
correct=female   guess=male     name=Carolan                       
correct=female   guess=male     name=Charmion                      
correct=female   guess=male     name=Clem                          
correct=female   guess=male     name=Demeter                       
correct=female   guess=male     name=Eve                           
correct=female   guess=male     name=Fallon                        
correct=female   guess=male     name=Frank                         
correct=female   guess=male     name=Franky                        
correct=female   guess=male     name=Honor                         
correct=female   guess=male     name=Janifer                       
correct=female   guess=male     name=Jean                          
correct=female   guess=male     name=Karin                         
correct=female   guess=male     name=Kerstin    